linux-stable/mm/mempolicy.c

3525 lines
89 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple NUMA memory policy for the Linux kernel.
*
* Copyright 2003,2004 Andi Kleen, SuSE Labs.
* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
*
* NUMA policy allows the user to give hints in which node(s) memory should
* be allocated.
*
* Support four policies per VMA and per process:
*
* The VMA policy has priority over the process policy for a page fault.
*
* interleave Allocate memory interleaved over a set of nodes,
* with normal fallback if it fails.
* For VMA based allocations this interleaves based on the
* offset into the backing object or offset into the mapping
* for anonymous memory. For process policy an process counter
* is used.
*
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
* weighted interleave
* Allocate memory interleaved over a set of nodes based on
* a set of weights (per-node), with normal fallback if it
* fails. Otherwise operates the same as interleave.
* Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
* on node 0 for every 1 page allocated on node 1.
*
* bind Only allocate memory on a specific set of nodes,
* no fallback.
* FIXME: memory is allocated starting with the first node
* to the last. It would be better if bind would truly restrict
* the allocation to memory nodes instead
*
2023-10-03 09:20:14 +00:00
* preferred Try a specific node first before normal fallback.
* As a special case NUMA_NO_NODE here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
* process policy.
*
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
* preferred many Try a set of nodes first before normal fallback. This is
* similar to preferred without the special case.
*
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
*
* The process policy is applied for most non interrupt memory allocations
* in that process' context. Interrupts ignore the policies and always
* try to allocate on the local CPU. The VMA policy is only applied for memory
* allocations for a VMA in the VM.
*
* Currently there are a few corner cases in swapping where the policy
* is not applied, but the majority should be handled. When process policy
* is used it is not remembered over swap outs/swap ins.
*
* Only the highest zone in the zone hierarchy gets policied. Allocations
* requesting a lower zone just use default policy. This implies that
* on systems with highmem kernel lowmem allocation don't get policied.
* Same with GFP_DMA allocations.
*
2023-10-03 09:20:14 +00:00
* For shmem/tmpfs shared memory the policy is shared between
* all users and remembered even when nobody has memory mapped.
*/
/* Notebook:
fix mmap readahead to honour policy and enable policy for any page cache
object
statistics for bigpages
global policy for page cache? currently it uses process policy. Requires
first item above.
handle mremap for shared memory (currently ignored for the policy)
grows down?
make bind policy root only? It can trigger oom much faster and the
kernel is not always grateful with that.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
ksm: memory hotremove migration only The previous patch enables page migration of ksm pages, but that soon gets into trouble: not surprising, since we're using the ksm page lock to lock operations on its stable_node, but page migration switches the page whose lock is to be used for that. Another layer of locking would fix it, but do we need that yet? Do we actually need page migration of ksm pages? Yes, memory hotremove needs to offline sections of memory: and since we stopped allocating ksm pages with GFP_HIGHUSER, they will tend to be GFP_HIGHUSER_MOVABLE candidates for migration. But KSM is currently unconscious of NUMA issues, happily merging pages from different NUMA nodes: at present the rule must be, not to use MADV_MERGEABLE where you care about NUMA. So no, NUMA page migration of ksm pages does not make sense yet. So, to complete support for ksm swapping we need to make hotremove safe. ksm_memory_callback() take ksm_thread_mutex when MEM_GOING_OFFLINE and release it when MEM_OFFLINE or MEM_CANCEL_OFFLINE. But if mapped pages are freed before migration reaches them, stable_nodes may be left still pointing to struct pages which have been removed from the system: the stable_node needs to identify a page by pfn rather than page pointer, then it can safely prune them when MEM_OFFLINE. And make NUMA migration skip PageKsm pages where it skips PageReserved. But it's only when we reach unmap_and_move() that the page lock is taken and we can be sure that raised pagecount has prevented a PageAnon from being upgraded: so add offlining arg to migrate_pages(), to migrate ksm page when offlining (has sufficient locking) but reject it otherwise. Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Izik Eidus <ieidus@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Chris Wright <chrisw@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 01:59:33 +00:00
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
#include <asm/tlbflush.h>
mm/mprotect: use mmu_gather Patch series "mm/mprotect: avoid unnecessary TLB flushes", v6. This patchset is intended to remove unnecessary TLB flushes during mprotect() syscalls. Once this patch-set make it through, similar and further optimizations for MADV_COLD and userfaultfd would be possible. Basically, there are 3 optimizations in this patch-set: 1. Use TLB batching infrastructure to batch flushes across VMAs and do better/fewer flushes. This would also be handy for later userfaultfd enhancements. 2. Avoid unnecessary TLB flushes. This optimization is the one that provides most of the performance benefits. Unlike previous versions, we now only avoid flushes that would not result in spurious page-faults. 3. Avoiding TLB flushes on change_huge_pmd() that are only needed to prevent the A/D bits from changing. Andrew asked for some benchmark numbers. I do not have an easy determinate macrobenchmark in which it is easy to show benefit. I therefore ran a microbenchmark: a loop that does the following on anonymous memory, just as a sanity check to see that time is saved by avoiding TLB flushes. The loop goes: mprotect(p, PAGE_SIZE, PROT_READ) mprotect(p, PAGE_SIZE, PROT_READ|PROT_WRITE) *p = 0; // make the page writable The test was run in KVM guest with 1 or 2 threads (the second thread was busy-looping). I measured the time (cycles) of each operation: 1 thread 2 threads mmots +patch mmots +patch PROT_READ 3494 2725 (-22%) 8630 7788 (-10%) PROT_READ|WRITE 3952 2724 (-31%) 9075 2865 (-68%) [ mmots = v5.17-rc6-mmots-2022-03-06-20-38 ] The exact numbers are really meaningless, but the benefit is clear. There are 2 interesting results though. (1) PROT_READ is cheaper, while one can expect it not to be affected. This is presumably due to TLB miss that is saved (2) Without memory access (*p = 0), the speedup of the patch is even greater. In that scenario mprotect(PROT_READ) also avoids the TLB flush. As a result both operations on the patched kernel take roughly ~1500 cycles (with either 1 or 2 threads), whereas on mmotm their cost is as high as presented in the table. This patch (of 3): change_pXX_range() currently does not use mmu_gather, but instead implements its own deferred TLB flushes scheme. This both complicates the code, as developers need to be aware of different invalidation schemes, and prevents opportunities to avoid TLB flushes or perform them in finer granularity. The use of mmu_gather for modified PTEs has benefits in various scenarios even if pages are not released. For instance, if only a single page needs to be flushed out of a range of many pages, only that page would be flushed. If a THP page is flushed, on x86 a single TLB invlpg instruction can be used instead of 512 instructions (or a full TLB flush, which would Linux would actually use by default). mprotect() over multiple VMAs requires a single flush. Use mmu_gather in change_pXX_range(). As the pages are not released, only record the flushed range using tlb_flush_pXX_range(). Handle THP similarly and get rid of flush_cache_range() which becomes redundant since tlb_start_vma() calls it when needed. Link: https://lkml.kernel.org/r/20220401180821.1986781-1-namit@vmware.com Link: https://lkml.kernel.org/r/20220401180821.1986781-2-namit@vmware.com Signed-off-by: Nadav Amit <namit@vmware.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Andrew Cooper <andrew.cooper3@citrix.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Peter Xu <peterx@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Cc: Yu Zhao <yuzhao@google.com> Cc: Nick Piggin <npiggin@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-05-10 01:20:50 +00:00
#include <asm/tlb.h>
#include <linux/uaccess.h>
vmscan: move isolate_lru_page() to vmscan.c On large memory systems, the VM can spend way too much time scanning through pages that it cannot (or should not) evict from memory. Not only does it use up CPU time, but it also provokes lock contention and can leave large systems under memory presure in a catatonic state. This patch series improves VM scalability by: 1) putting filesystem backed, swap backed and unevictable pages onto their own LRUs, so the system only scans the pages that it can/should evict from memory 2) switching to two handed clock replacement for the anonymous LRUs, so the number of pages that need to be scanned when the system starts swapping is bound to a reasonable number 3) keeping unevictable pages off the LRU completely, so the VM does not waste CPU time scanning them. ramfs, ramdisk, SHM_LOCKED shared memory segments and mlock()ed VMA pages are keept on the unevictable list. This patch: isolate_lru_page logically belongs to be in vmscan.c than migrate.c. It is tough, because we don't need that function without memory migration so there is a valid argument to have it in migrate.c. However a subsequent patch needs to make use of it in the core mm, so we can happily move it to vmscan.c. Also, make the function a little more generic by not requiring that it adds an isolated page to a given list. Callers can do that. Note that we now have '__isolate_lru_page()', that does something quite different, visible outside of vmscan.c for use with memory controller. Methinks we need to rationalize these names/purposes. --lts [akpm@linux-foundation.org: fix mm/memory_hotplug.c build] Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Rik van Riel <riel@redhat.com> Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 03:26:09 +00:00
#include "internal.h"
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
#define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
/* Highest zone. An specific allocation for a zone below that is not
policied. */
[PATCH] optional ZONE_DMA: deal with cases of ZONE_DMA meaning the first zone This patchset follows up on the earlier work in Andrew's tree to reduce the number of zones. The patches allow to go to a minimum of 2 zones. This one allows also to make ZONE_DMA optional and therefore the number of zones can be reduced to one. ZONE_DMA is usually used for ISA DMA devices. There are a number of reasons why we would not want to have ZONE_DMA 1. Some arches do not need ZONE_DMA at all. 2. With the advent of IOMMUs DMA zones are no longer needed. The necessity of DMA zones may drastically be reduced in the future. This patchset allows a compilation of a kernel without that overhead. 3. Devices that require ISA DMA get rare these days. All my systems do not have any need for ISA DMA. 4. The presence of an additional zone unecessarily complicates VM operations because it must be scanned and balancing logic must operate on its. 5. With only ZONE_NORMAL one can reach the situation where we have only one zone. This will allow the unrolling of many loops in the VM and allows the optimization of varous code paths in the VM. 6. Having only a single zone in a NUMA system results in a 1-1 correspondence between nodes and zones. Various additional optimizations to critical VM paths become possible. Many systems today can operate just fine with a single zone. If you look at what is in ZONE_DMA then one usually sees that nothing uses it. The DMA slabs are empty (Some arches use ZONE_DMA instead of ZONE_NORMAL, then ZONE_NORMAL will be empty instead). On all of my systems (i386, x86_64, ia64) ZONE_DMA is completely empty. Why constantly look at an empty zone in /proc/zoneinfo and empty slab in /proc/slabinfo? Non i386 also frequently have no need for ZONE_DMA and zones stay empty. The patchset was tested on i386 (UP / SMP), x86_64 (UP, NUMA) and ia64 (NUMA). The RFC posted earlier (see http://marc.theaimsgroup.com/?l=linux-kernel&m=115231723513008&w=2) had lots of #ifdefs in them. An effort has been made to minize the number of #ifdefs and make this as compact as possible. The job was made much easier by the ongoing efforts of others to extract common arch specific functionality. I have been running this for awhile now on my desktop and finally Linux is using all my available RAM instead of leaving the 16MB in ZONE_DMA untouched: christoph@pentium940:~$ cat /proc/zoneinfo Node 0, zone Normal pages free 4435 min 1448 low 1810 high 2172 active 241786 inactive 210170 scanned 0 (a: 0 i: 0) spanned 524224 present 524224 nr_anon_pages 61680 nr_mapped 14271 nr_file_pages 390264 nr_slab_reclaimable 27564 nr_slab_unreclaimable 1793 nr_page_table_pages 449 nr_dirty 39 nr_writeback 0 nr_unstable 0 nr_bounce 0 cpu: 0 pcp: 0 count: 156 high: 186 batch: 31 cpu: 0 pcp: 1 count: 9 high: 62 batch: 15 vm stats threshold: 20 cpu: 1 pcp: 0 count: 177 high: 186 batch: 31 cpu: 1 pcp: 1 count: 12 high: 62 batch: 15 vm stats threshold: 20 all_unreclaimable: 0 prev_priority: 12 temp_priority: 12 start_pfn: 0 This patch: In two places in the VM we use ZONE_DMA to refer to the first zone. If ZONE_DMA is optional then other zones may be first. So simply replace ZONE_DMA with zone 0. This also fixes ZONETABLE_PGSHIFT. If we have only a single zone then ZONES_PGSHIFT may become 0 because there is no need anymore to encode the zone number related to a pgdat. However, we still need a zonetable to index all the zones for each node if this is a NUMA system. Therefore define ZONETABLE_SHIFT unconditionally as the offset of the ZONE field in page flags. [apw@shadowen.org: fix mismerge] Acked-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Christoph Lameter <clameter@sgi.com> Cc: Andi Kleen <ak@suse.de> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Kyle McMartin <kyle@mcmartin.ca> Cc: Matthew Wilcox <willy@debian.org> Cc: James Bottomley <James.Bottomley@steeleye.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Andy Whitcroft <apw@shadowen.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 09:43:07 +00:00
enum zone_type policy_zone = 0;
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
/*
* run-time system-wide default policy => local allocation
*/
static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
.mode = MPOL_LOCAL,
};
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
mm/mempolicy: implement the sysfs-based weighted_interleave interface Patch series "mm/mempolicy: weighted interleave mempolicy and sysfs extension", v5. Weighted interleave is a new interleave policy intended to make use of heterogeneous memory environments appearing with CXL. The existing interleave mechanism does an even round-robin distribution of memory across all nodes in a nodemask, while weighted interleave distributes memory across nodes according to a provided weight. (Weight = # of page allocations per round) Weighted interleave is intended to reduce average latency when bandwidth is pressured - therefore increasing total throughput. In other words: It allows greater use of the total available bandwidth in a heterogeneous hardware environment (different hardware provides different bandwidth capacity). As bandwidth is pressured, latency increases - first linearly and then exponentially. By keeping bandwidth usage distributed according to available bandwidth, we therefore can reduce the average latency of a cacheline fetch. A good explanation of the bandwidth vs latency response curve: https://mahmoudhatem.wordpress.com/2017/11/07/memory-bandwidth-vs-latency-response-curve/ From the article: ``` Constant region: The latency response is fairly constant for the first 40% of the sustained bandwidth. Linear region: In between 40% to 80% of the sustained bandwidth, the latency response increases almost linearly with the bandwidth demand of the system due to contention overhead by numerous memory requests. Exponential region: Between 80% to 100% of the sustained bandwidth, the memory latency is dominated by the contention latency which can be as much as twice the idle latency or more. Maximum sustained bandwidth : Is 65% to 75% of the theoretical maximum bandwidth. ``` As a general rule of thumb: * If bandwidth usage is low, latency does not increase. It is optimal to place data in the nearest (lowest latency) device. * If bandwidth usage is high, latency increases. It is optimal to place data such that bandwidth use is optimized per-device. This is the top line goal: Provide a user a mechanism to target using the "maximum sustained bandwidth" of each hardware component in a heterogenous memory system. For example, the stream benchmark demonstrates that 1:1 (default) interleave is actively harmful, while weighted interleave can be beneficial. Default interleave distributes data such that too much pressure is placed on devices with lower available bandwidth. Stream Benchmark (vs DRAM, 1 Socket + 1 CXL Device) Default interleave : -78% (slower than DRAM) Global weighting : -6% to +4% (workload dependant) Targeted weights : +2.5% to +4% (consistently better than DRAM) Global means the task-policy was set (set_mempolicy), while targeted means VMA policies were set (mbind2). We see weighted interleave is not always beneficial when applied globally, but is always beneficial when applied to bandwidth-driving memory regions. There are 4 patches in this set: 1) Implement system-global interleave weights as sysfs extension in mm/mempolicy.c. These weights are RCU protected, and a default weight set is provided (all weights are 1 by default). In future work, we intend to expose an interface for HMAT/CDAT code to set reasonable default values based on the memory configuration of the system discovered at boot/hotplug. 2) A mild refactor of some interleave-logic for re-use in the new weighted interleave logic. 3) MPOL_WEIGHTED_INTERLEAVE extension for set_mempolicy/mbind 4) Protect interleave logic (weighted and normal) with the mems_allowed seq cookie. If the nodemask changes while accessing it during a rebind, just retry the access. Included below are some performance and LTP test information, and a sample numactl branch which can be used for testing. = Performance summary = (tests may have different configurations, see extended info below) 1) MLC (W2) : +38% over DRAM. +264% over default interleave. MLC (W5) : +40% over DRAM. +226% over default interleave. 2) Stream : -6% to +4% over DRAM, +430% over default interleave. 3) XSBench : +19% over DRAM. +47% over default interleave. = LTP Testing Summary = existing mempolicy & mbind tests: pass mempolicy & mbind + weighted interleave (global weights): pass = version history v5: - style fixes - mems_allowed cookie protection to detect rebind issues, prevents spurious allocation failures and/or mis-allocations - sparse warning fixes related to __rcu on local variables ===================================================================== Performance tests - MLC From - Ravi Jonnalagadda <ravis.opensrc@micron.com> Hardware: Single-socket, multiple CXL memory expanders. Workload: W2 Data Signature: 2:1 read:write DRAM only bandwidth (GBps): 298.8 DRAM + CXL (default interleave) (GBps): 113.04 DRAM + CXL (weighted interleave)(GBps): 412.5 Gain over DRAM only: 1.38x Gain over default interleave: 2.64x Workload: W5 Data Signature: 1:1 read:write DRAM only bandwidth (GBps): 273.2 DRAM + CXL (default interleave) (GBps): 117.23 DRAM + CXL (weighted interleave)(GBps): 382.7 Gain over DRAM only: 1.4x Gain over default interleave: 2.26x ===================================================================== Performance test - Stream From - Gregory Price <gregory.price@memverge.com> Hardware: Single socket, single CXL expander numactl extension: https://github.com/gmprice/numactl/tree/weighted_interleave_master Summary: 64 threads, ~18GB workload, 3GB per array, executed 100 times Default interleave : -78% (slower than DRAM) Global weighting : -6% to +4% (workload dependant) mbind2 weights : +2.5% to +4% (consistently better than DRAM) dram only: numactl --cpunodebind=1 --membind=1 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Function Direction BestRateMBs AvgTime MinTime MaxTime Copy: 0->0 200923.2 0.032662 0.031853 0.033301 Scale: 0->0 202123.0 0.032526 0.031664 0.032970 Add: 0->0 208873.2 0.047322 0.045961 0.047884 Triad: 0->0 208523.8 0.047262 0.046038 0.048414 CXL-only: numactl --cpunodebind=1 -w --membind=2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 22209.7 0.288661 0.288162 0.289342 Scale: 0->0 22288.2 0.287549 0.287147 0.288291 Add: 0->0 24419.1 0.393372 0.393135 0.393735 Triad: 0->0 24484.6 0.392337 0.392083 0.394331 Based on the above, the optimal weights are ~9:1 echo 9 > /sys/kernel/mm/mempolicy/weighted_interleave/node1 echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node2 default interleave: numactl --cpunodebind=1 --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 44666.2 0.143671 0.143285 0.144174 Scale: 0->0 44781.6 0.143256 0.142916 0.143713 Add: 0->0 48600.7 0.197719 0.197528 0.197858 Triad: 0->0 48727.5 0.197204 0.197014 0.197439 global weighted interleave: numactl --cpunodebind=1 -w --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 190085.9 0.034289 0.033669 0.034645 Scale: 0->0 207677.4 0.031909 0.030817 0.033061 Add: 0->0 202036.8 0.048737 0.047516 0.053409 Triad: 0->0 217671.5 0.045819 0.044103 0.046755 targted regions w/ global weights (modified stream to mbind2 malloc'd regions)) numactl --cpunodebind=1 --membind=1 ./stream_c.exe -b --ntimes 100 --array-size 400M --malloc Copy: 0->0 205827.0 0.031445 0.031094 0.031984 Scale: 0->0 208171.8 0.031320 0.030744 0.032505 Add: 0->0 217352.0 0.045087 0.044168 0.046515 Triad: 0->0 216884.8 0.045062 0.044263 0.046982 ===================================================================== Performance tests - XSBench From - Hyeongtak Ji <hyeongtak.ji@sk.com> Hardware: Single socket, Single CXL memory Expander NUMA node 0: 56 logical cores, 128 GB memory NUMA node 2: 96 GB CXL memory Threads: 56 Lookups: 170,000,000 Summary: +19% over DRAM. +47% over default interleave. Performance tests - XSBench 1. dram only $ numactl -m 0 ./XSBench -s XL –p 5000000 Runtime: 36.235 seconds Lookups/s: 4,691,618 2. default interleave $ numactl –i 0,2 ./XSBench –s XL –p 5000000 Runtime: 55.243 seconds Lookups/s: 3,077,293 3. weighted interleave numactl –w –i 0,2 ./XSBench –s XL –p 5000000 Runtime: 29.262 seconds Lookups/s: 5,809,513 ===================================================================== LTP Tests: https://github.com/gmprice/ltp/tree/mempolicy2 = Existing tests set_mempolicy, get_mempolicy, mbind MPOL_WEIGHTED_INTERLEAVE added manually to test basic functionality but did not adjust tests for weighting. Basically the weights were set to 1, which is the default, and it should behave the same as MPOL_INTERLEAVE if logic is correct. == set_mempolicy01 : passed 18, failed 0 == set_mempolicy02 : passed 10, failed 0 == set_mempolicy03 : passed 64, failed 0 == set_mempolicy04 : passed 32, failed 0 == set_mempolicy05 - n/a on non-x86 == set_mempolicy06 : passed 10, failed 0 this is set_mempolicy02 + MPOL_WEIGHTED_INTERLEAVE == set_mempolicy07 : passed 32, failed 0 set_mempolicy04 + MPOL_WEIGHTED_INTERLEAVE == get_mempolicy01 : passed 12, failed 0 change: added MPOL_WEIGHTED_INTERLEAVE == get_mempolicy02 : passed 2, failed 0 == mbind01 : passed 15, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind02 : passed 4, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind03 : passed 16, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind04 : passed 48, failed 0 added MPOL_WEIGHTED_INTERLEAVE ===================================================================== numactl (set_mempolicy) w/ global weighting test numactl fork: https://github.com/gmprice/numactl/tree/weighted_interleave_master command: numactl -w --interleave=0,1 ./eatmem result (weights 1:1): 0176a000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=32897 N1=32896 kernelpagesize_kB=4 7fceeb9ff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=32768 N1=32769 kernelpagesize_kB=4 50% distribution is correct result (weights 5:1): 01b14000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=54828 N1=10965 kernelpagesize_kB=4 7f47a1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=54614 N1=10923 kernelpagesize_kB=4 16.666% distribution is correct result (weights 1:5): 01f07000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=10966 N1=54827 kernelpagesize_kB=4 7f17b1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=10923 N1=54614 kernelpagesize_kB=4 16.666% distribution is correct #include <stdio.h> #include <stdlib.h> #include <string.h> int main (void) { char* mem = malloc(1024*1024*256); memset(mem, 1, 1024*1024*256); for (int i = 0; i < ((1024*1024*256)/4096); i++) { mem = malloc(4096); mem[0] = 1; } printf("done\n"); getchar(); return 0; } This patch (of 4): This patch provides a way to set interleave weight information under sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN The sysfs structure is designed as follows. $ tree /sys/kernel/mm/mempolicy/ /sys/kernel/mm/mempolicy/ [1] └── weighted_interleave [2] ├── node0 [3] └── node1 Each file above can be explained as follows. [1] mm/mempolicy: configuration interface for mempolicy subsystem [2] weighted_interleave/: config interface for weighted interleave policy [3] weighted_interleave/nodeN: weight for nodeN If a node value is set to `0`, the system-default value will be used. As of this patch, the system-default for all nodes is always 1. Link: https://lkml.kernel.org/r/20240202170238.90004-1-gregory.price@memverge.com Link: https://lkml.kernel.org/r/20240202170238.90004-2-gregory.price@memverge.com Suggested-by: "Huang, Ying" <ying.huang@intel.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Gregory Price <gregory.price@memverge.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Gregory Price <gourry.memverge@gmail.com> Cc: Hasan Al Maruf <Hasan.Maruf@amd.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:35 +00:00
/*
* iw_table is the sysfs-set interleave weight table, a value of 0 denotes
* system-default value should be used. A NULL iw_table also denotes that
* system-default values should be used. Until the system-default table
* is implemented, the system-default is always 1.
*
* iw_table is RCU protected
*/
static u8 __rcu *iw_table;
static DEFINE_MUTEX(iw_table_lock);
static u8 get_il_weight(int node)
{
u8 *table;
u8 weight;
rcu_read_lock();
table = rcu_dereference(iw_table);
/* if no iw_table, use system default */
weight = table ? table[node] : 1;
/* if value in iw_table is 0, use system default */
weight = weight ? weight : 1;
rcu_read_unlock();
return weight;
}
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
* @state: State to filter the search
*
* Lookup the closest node by distance if @nid is not in state.
*
* Return: this @node if it is in state, otherwise the closest node by distance
*/
int numa_nearest_node(int node, unsigned int state)
{
int min_dist = INT_MAX, dist, n, min_node;
if (state >= NR_NODE_STATES)
return -EINVAL;
if (node == NUMA_NO_NODE || node_state(node, state))
return node;
min_node = node;
for_each_node_state(n, state) {
dist = node_distance(node, n);
if (dist < min_dist) {
min_dist = dist;
min_node = n;
}
}
return min_node;
}
EXPORT_SYMBOL_GPL(numa_nearest_node);
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
int node;
if (pol)
return pol;
node = numa_node_id();
if (node != NUMA_NO_NODE) {
pol = &preferred_node_policy[node];
/* preferred_node_policy is not initialised early in boot */
if (pol->mode)
return pol;
}
return &default_policy;
}
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
mempolicy: add MPOL_F_STATIC_NODES flag Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses the node remap when the policy is rebound. Adds another member to struct mempolicy, nodemask_t user_nodemask, as part of a union with cpuset_mems_allowed: struct mempolicy { ... union { nodemask_t cpuset_mems_allowed; nodemask_t user_nodemask; } w; } that stores the the nodemask that the user passed when he or she created the mempolicy via set_mempolicy() or mbind(). When using MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's passed nodemask intersected with the VMA or task's allowed nodes is always used when determining the preferred node, setting the MPOL_BIND zonelist, or creating the interleave nodemask. This happens whenever the policy is rebound, including when a task's cpuset assignment changes or the cpuset's mems are changed. This creates an interesting side-effect in that it allows the mempolicy "intent" to lie dormant and uneffected until it has access to the node(s) that it desires. For example, if you currently ask for an interleaved policy over a set of nodes that you do not have access to, the mempolicy is not created and the task continues to use the previous policy. With this change, however, it is possible to create the same mempolicy; it is only effected when access to nodes in the nodemask is acquired. It is also possible to mount tmpfs with the static nodemask behavior when specifying a node or nodemask. To do this, simply add "=static" immediately following the mempolicy mode at mount time: mount -o remount mpol=interleave=static:1-3 Also removes mpol_check_policy() and folds its logic into mpol_new() since it is now obsoleted. The unused vma_mpol_equal() is also removed. Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:27 +00:00
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
return pol->flags & MPOL_MODE_FLAGS;
}
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
const nodemask_t *rel)
{
nodemask_t tmp;
nodes_fold(tmp, *orig, nodes_weight(*rel));
nodes_onto(*ret, tmp, *rel);
mempolicy: add MPOL_F_STATIC_NODES flag Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses the node remap when the policy is rebound. Adds another member to struct mempolicy, nodemask_t user_nodemask, as part of a union with cpuset_mems_allowed: struct mempolicy { ... union { nodemask_t cpuset_mems_allowed; nodemask_t user_nodemask; } w; } that stores the the nodemask that the user passed when he or she created the mempolicy via set_mempolicy() or mbind(). When using MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's passed nodemask intersected with the VMA or task's allowed nodes is always used when determining the preferred node, setting the MPOL_BIND zonelist, or creating the interleave nodemask. This happens whenever the policy is rebound, including when a task's cpuset assignment changes or the cpuset's mems are changed. This creates an interesting side-effect in that it allows the mempolicy "intent" to lie dormant and uneffected until it has access to the node(s) that it desires. For example, if you currently ask for an interleaved policy over a set of nodes that you do not have access to, the mempolicy is not created and the task continues to use the previous policy. With this change, however, it is possible to create the same mempolicy; it is only effected when access to nodes in the nodemask is acquired. It is also possible to mount tmpfs with the static nodemask behavior when specifying a node or nodemask. To do this, simply add "=static" immediately following the mempolicy mode at mount time: mount -o remount mpol=interleave=static:1-3 Also removes mpol_check_policy() and folds its logic into mpol_new() since it is now obsoleted. The unused vma_mpol_equal() is also removed. Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:27 +00:00
}
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
pol->nodes = *nodes;
return 0;
}
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
if (nodes_empty(*nodes))
return -EINVAL;
nodes_clear(pol->nodes);
node_set(first_node(*nodes), pol->nodes);
return 0;
}
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
* parameter with respect to the policy mode and flags.
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
* and mempolicy. May also be called holding the mmap_lock for write.
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
*/
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
{
int ret;
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
/*
* Default (pol==NULL) resp. local memory policies are not a
* subject of any remapping. They also do not need any special
* constructor.
*/
if (!pol || pol->mode == MPOL_LOCAL)
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
return 0;
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
/* Check N_MEMORY */
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_MEMORY]);
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
VM_BUG_ON(!nodes);
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
else
nodes_and(nsc->mask2, *nodes, nsc->mask1);
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
if (mpol_store_user_nodemask(pol))
pol->w.user_nodemask = *nodes;
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
else
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
return ret;
}
/*
* This function just creates a new policy, does some check and simple
* initialization. You must invoke mpol_set_nodemask() to set nodes.
*/
mempolicy: support optional mode flags With the evolution of mempolicies, it is necessary to support mempolicy mode flags that specify how the policy shall behave in certain circumstances. The most immediate need for mode flag support is to suppress remapping the nodemask of a policy at the time of rebind. Both the mempolicy mode and flags are passed by the user in the 'int policy' formal of either the set_mempolicy() or mbind() syscall. A new constant, MPOL_MODE_FLAGS, represents the union of legal optional flags that may be passed as part of this int. Mempolicies that include illegal flags as part of their policy are rejected as invalid. An additional member to struct mempolicy is added to support the mode flags: struct mempolicy { ... unsigned short policy; unsigned short flags; } The splitting of the 'int' actual passed by the user is done in sys_set_mempolicy() and sys_mbind() for their respective syscalls. This is done by intersecting the actual with MPOL_MODE_FLAGS, rejecting the syscall of there are additional flags, and storing it in the new 'flags' member of struct mempolicy. The intersection of the actual with ~MPOL_MODE_FLAGS is stored in the 'policy' member of the struct and all current users of pol->policy remain unchanged. The union of the policy mode and optional mode flags is passed back to the user in get_mempolicy(). This combination of mode and flags within the same actual does not break userspace code that relies on get_mempolicy(&policy, ...) and either switch (policy) { case MPOL_BIND: ... case MPOL_INTERLEAVE: ... }; statements or if (policy == MPOL_INTERLEAVE) { ... } statements. Such applications would need to use optional mode flags when calling set_mempolicy() or mbind() for these previously implemented statements to stop working. If an application does start using optional mode flags, it will need to mask the optional flags off the policy in switch and conditional statements that only test mode. An additional member is also added to struct shmem_sb_info to store the optional mode flags. [hugh@veritas.com: shmem mpol: fix build warning] Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:25 +00:00
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *policy;
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
return NULL;
}
VM_BUG_ON(!nodes);
/*
* MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
* MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
* All other modes require a valid pointer to a non-empty nodemask.
*/
if (mode == MPOL_PREFERRED) {
if (nodes_empty(*nodes)) {
if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
mode = MPOL_LOCAL;
}
} else if (mode == MPOL_LOCAL) {
if (!nodes_empty(*nodes) ||
(flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL);
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
2023-10-03 09:20:14 +00:00
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: <linux-api@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-01-14 22:08:17 +00:00
policy->home_node = NUMA_NO_NODE;
return policy;
}
mempolicy: rework mempolicy Reference Counting [yet again] After further discussion with Christoph Lameter, it has become clear that my earlier attempts to clean up the mempolicy reference counting were a bit of overkill in some areas, resulting in superflous ref/unref in what are usually fast paths. In other areas, further inspection reveals that I botched the unref for interleave policies. A separate patch, suitable for upstream/stable trees, fixes up the known errors in the previous attempt to fix reference counting. This patch reworks the memory policy referencing counting and, one hopes, simplifies the code. Maybe I'll get it right this time. See the update to the numa_memory_policy.txt document for a discussion of memory policy reference counting that motivates this patch. Summary: Lookup of mempolicy, based on (vma, address) need only add a reference for shared policy, and we need only unref the policy when finished for shared policies. So, this patch backs out all of the unneeded extra reference counting added by my previous attempt. It then unrefs only shared policies when we're finished with them, using the mpol_cond_put() [conditional put] helper function introduced by this patch. Note that shmem_swapin() calls read_swap_cache_async() with a dummy vma containing just the policy. read_swap_cache_async() can call alloc_page_vma() multiple times, so we can't let alloc_page_vma() unref the shared policy in this case. To avoid this, we make a copy of any non-null shared policy and remove the MPOL_F_SHARED flag from the copy. This copy occurs before reading a page [or multiple pages] from swap, so the overhead should not be an issue here. I introduced a new static inline function "mpol_cond_copy()" to copy the shared policy to an on-stack policy and remove the flags that would require a conditional free. The current implementation of mpol_cond_copy() assumes that the struct mempolicy contains no pointers to dynamically allocated structures that must be duplicated or reference counted during copy. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:16 +00:00
/* Slow path of a mpol destructor. */
2023-10-03 09:20:14 +00:00
void __mpol_put(struct mempolicy *pol)
mempolicy: rework mempolicy Reference Counting [yet again] After further discussion with Christoph Lameter, it has become clear that my earlier attempts to clean up the mempolicy reference counting were a bit of overkill in some areas, resulting in superflous ref/unref in what are usually fast paths. In other areas, further inspection reveals that I botched the unref for interleave policies. A separate patch, suitable for upstream/stable trees, fixes up the known errors in the previous attempt to fix reference counting. This patch reworks the memory policy referencing counting and, one hopes, simplifies the code. Maybe I'll get it right this time. See the update to the numa_memory_policy.txt document for a discussion of memory policy reference counting that motivates this patch. Summary: Lookup of mempolicy, based on (vma, address) need only add a reference for shared policy, and we need only unref the policy when finished for shared policies. So, this patch backs out all of the unneeded extra reference counting added by my previous attempt. It then unrefs only shared policies when we're finished with them, using the mpol_cond_put() [conditional put] helper function introduced by this patch. Note that shmem_swapin() calls read_swap_cache_async() with a dummy vma containing just the policy. read_swap_cache_async() can call alloc_page_vma() multiple times, so we can't let alloc_page_vma() unref the shared policy in this case. To avoid this, we make a copy of any non-null shared policy and remove the MPOL_F_SHARED flag from the copy. This copy occurs before reading a page [or multiple pages] from swap, so the overhead should not be an issue here. I introduced a new static inline function "mpol_cond_copy()" to copy the shared policy to an on-stack policy and remove the flags that would require a conditional free. The current implementation of mpol_cond_copy() assumes that the struct mempolicy contains no pointers to dynamically allocated structures that must be duplicated or reference counted during copy. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:16 +00:00
{
2023-10-03 09:20:14 +00:00
if (!atomic_dec_and_test(&pol->refcnt))
mempolicy: rework mempolicy Reference Counting [yet again] After further discussion with Christoph Lameter, it has become clear that my earlier attempts to clean up the mempolicy reference counting were a bit of overkill in some areas, resulting in superflous ref/unref in what are usually fast paths. In other areas, further inspection reveals that I botched the unref for interleave policies. A separate patch, suitable for upstream/stable trees, fixes up the known errors in the previous attempt to fix reference counting. This patch reworks the memory policy referencing counting and, one hopes, simplifies the code. Maybe I'll get it right this time. See the update to the numa_memory_policy.txt document for a discussion of memory policy reference counting that motivates this patch. Summary: Lookup of mempolicy, based on (vma, address) need only add a reference for shared policy, and we need only unref the policy when finished for shared policies. So, this patch backs out all of the unneeded extra reference counting added by my previous attempt. It then unrefs only shared policies when we're finished with them, using the mpol_cond_put() [conditional put] helper function introduced by this patch. Note that shmem_swapin() calls read_swap_cache_async() with a dummy vma containing just the policy. read_swap_cache_async() can call alloc_page_vma() multiple times, so we can't let alloc_page_vma() unref the shared policy in this case. To avoid this, we make a copy of any non-null shared policy and remove the MPOL_F_SHARED flag from the copy. This copy occurs before reading a page [or multiple pages] from swap, so the overhead should not be an issue here. I introduced a new static inline function "mpol_cond_copy()" to copy the shared policy to an on-stack policy and remove the flags that would require a conditional free. The current implementation of mpol_cond_copy() assumes that the struct mempolicy contains no pointers to dynamically allocated structures that must be duplicated or reference counted during copy. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:16 +00:00
return;
2023-10-03 09:20:14 +00:00
kmem_cache_free(policy_cache, pol);
mempolicy: rework mempolicy Reference Counting [yet again] After further discussion with Christoph Lameter, it has become clear that my earlier attempts to clean up the mempolicy reference counting were a bit of overkill in some areas, resulting in superflous ref/unref in what are usually fast paths. In other areas, further inspection reveals that I botched the unref for interleave policies. A separate patch, suitable for upstream/stable trees, fixes up the known errors in the previous attempt to fix reference counting. This patch reworks the memory policy referencing counting and, one hopes, simplifies the code. Maybe I'll get it right this time. See the update to the numa_memory_policy.txt document for a discussion of memory policy reference counting that motivates this patch. Summary: Lookup of mempolicy, based on (vma, address) need only add a reference for shared policy, and we need only unref the policy when finished for shared policies. So, this patch backs out all of the unneeded extra reference counting added by my previous attempt. It then unrefs only shared policies when we're finished with them, using the mpol_cond_put() [conditional put] helper function introduced by this patch. Note that shmem_swapin() calls read_swap_cache_async() with a dummy vma containing just the policy. read_swap_cache_async() can call alloc_page_vma() multiple times, so we can't let alloc_page_vma() unref the shared policy in this case. To avoid this, we make a copy of any non-null shared policy and remove the MPOL_F_SHARED flag from the copy. This copy occurs before reading a page [or multiple pages] from swap, so the overhead should not be an issue here. I introduced a new static inline function "mpol_cond_copy()" to copy the shared policy to an on-stack policy and remove the flags that would require a conditional free. The current implementation of mpol_cond_copy() assumes that the struct mempolicy contains no pointers to dynamically allocated structures that must be duplicated or reference counted during copy. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:16 +00:00
}
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
nodemask_t tmp;
if (pol->flags & MPOL_F_STATIC_NODES)
nodes_and(tmp, pol->w.user_nodemask, *nodes);
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
*nodes);
mm/mempolicy.c: fix an incorrect rebind node in mpol_rebind_nodemask mpol_rebind_nodemask() is called for MPOL_BIND and MPOL_INTERLEAVE mempoclicies when the tasks's cpuset's mems_allowed changes. For policies created without MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES, it works by remapping the policy's allowed nodes (stored in v.nodes) using the previous value of mems_allowed (stored in w.cpuset_mems_allowed) as the domain of map and the new mems_allowed (passed as nodes) as the range of the map (see the comment of bitmap_remap() for details). The result of remapping is stored back as policy's nodemask in v.nodes, and the new value of mems_allowed should be stored in w.cpuset_mems_allowed to facilitate the next rebind, if it happens. However, 213980c0f23b ("mm, mempolicy: simplify rebinding mempolicies when updating cpusets") introduced a bug where the result of remapping is stored in w.cpuset_mems_allowed instead. Thus, a mempolicy's allowed nodes can evolve in an unexpected way after a series of rebinding due to cpuset mems_allowed changes, possibly binding to a wrong node or a smaller number of nodes which may e.g. overload them. This patch fixes the bug so rebinding again works as intended. [vbabka@suse.cz: new changlog] Link: http://lkml.kernel.org/r/ef6a69c6-c052-b067-8f2c-9d615c619bb9@suse.cz Link: http://lkml.kernel.org/r/1558768043-23184-1-git-send-email-zhongjiang@huawei.com Fixes: 213980c0f23b ("mm, mempolicy: simplify rebinding mempolicies when updating cpusets") Signed-off-by: zhong jiang <zhongjiang@huawei.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Oscar Salvador <osalvador@suse.de> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-06-28 19:06:43 +00:00
pol->w.cpuset_mems_allowed = *nodes;
}
mempolicy: add MPOL_F_STATIC_NODES flag Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses the node remap when the policy is rebound. Adds another member to struct mempolicy, nodemask_t user_nodemask, as part of a union with cpuset_mems_allowed: struct mempolicy { ... union { nodemask_t cpuset_mems_allowed; nodemask_t user_nodemask; } w; } that stores the the nodemask that the user passed when he or she created the mempolicy via set_mempolicy() or mbind(). When using MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's passed nodemask intersected with the VMA or task's allowed nodes is always used when determining the preferred node, setting the MPOL_BIND zonelist, or creating the interleave nodemask. This happens whenever the policy is rebound, including when a task's cpuset assignment changes or the cpuset's mems are changed. This creates an interesting side-effect in that it allows the mempolicy "intent" to lie dormant and uneffected until it has access to the node(s) that it desires. For example, if you currently ask for an interleaved policy over a set of nodes that you do not have access to, the mempolicy is not created and the task continues to use the previous policy. With this change, however, it is possible to create the same mempolicy; it is only effected when access to nodes in the nodemask is acquired. It is also possible to mount tmpfs with the static nodemask behavior when specifying a node or nodemask. To do this, simply add "=static" immediately following the mempolicy mode at mount time: mount -o remount mpol=interleave=static:1-3 Also removes mpol_check_policy() and folds its logic into mpol_new() since it is now obsoleted. The unused vma_mpol_equal() is also removed. Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:27 +00:00
mempolicy: restructure rebinding-mempolicy functions Nick Piggin reported that the allocator may see an empty nodemask when changing cpuset's mems[1]. It happens only on the kernel that do not do atomic nodemask_t stores. (MAX_NUMNODES > BITS_PER_LONG) But I found that there is also a problem on the kernel that can do atomic nodemask_t stores. The problem is that the allocator can't find a node to alloc page when changing cpuset's mems though there is a lot of free memory. The reason is like this: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom I can use the attached program reproduce it by the following step: # mkdir /dev/cpuset # mount -t cpuset cpuset /dev/cpuset # mkdir /dev/cpuset/1 # echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus # echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems # echo $$ > /dev/cpuset/1/tasks # numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog <nr_tasks> & <nr_tasks> = max(nr_cpus - 1, 1) # killall -s SIGUSR1 cpuset_mem_hog # ./change_mems.sh several hours later, oom will happen though there is a lot of free memory. This patchset fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. This patch: In order to fix no node to alloc memory, when we want to update mempolicy and mems_allowed, we expand the set of nodes first (set all the newly nodes) and shrink the set of nodes lazily(clean disallowed nodes), But the mempolicy's rebind functions may breaks the expanding. So we restructure the mempolicy's rebind functions and split the rebind work to two steps, just like the update of cpuset's mems: The 1st step: expand the set of the mempolicy's nodes. The 2nd step: shrink the set of the mempolicy's nodes. It is used when there is no real lock to protect the mempolicy in the read-side. Otherwise we can do rebind work at once. In order to implement it, we define enum mpol_rebind_step { MPOL_REBIND_ONCE, MPOL_REBIND_STEP1, MPOL_REBIND_STEP2, MPOL_REBIND_NSTEP, }; If the mempolicy needn't be updated by two steps, we can pass MPOL_REBIND_ONCE to the rebind functions. Or we can pass MPOL_REBIND_STEP1 to do the first step of the rebind work and pass MPOL_REBIND_STEP2 to do the second step work. Besides that, it maybe long time between these two step and we have to release the lock that protects mempolicy and mems_allowed. If we hold the lock once again, we must check whether the current mempolicy is under the rebinding (the first step has been done) or not, because the task may alloc a new mempolicy when we don't hold the lock. So we defined the following flag to identify it: #define MPOL_F_REBINDING (1 << 2) The new functions will be used in the next patch. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Paul Menage <menage@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Ravikiran Thirumalai <kiran@scalex86.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andi Kleen <andi@firstfloor.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-24 21:32:07 +00:00
if (nodes_empty(tmp))
tmp = *nodes;
pol->nodes = tmp;
}
static void mpol_rebind_preferred(struct mempolicy *pol,
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
const nodemask_t *nodes)
{
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
pol->w.cpuset_mems_allowed = *nodes;
}
mempolicy: restructure rebinding-mempolicy functions Nick Piggin reported that the allocator may see an empty nodemask when changing cpuset's mems[1]. It happens only on the kernel that do not do atomic nodemask_t stores. (MAX_NUMNODES > BITS_PER_LONG) But I found that there is also a problem on the kernel that can do atomic nodemask_t stores. The problem is that the allocator can't find a node to alloc page when changing cpuset's mems though there is a lot of free memory. The reason is like this: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom I can use the attached program reproduce it by the following step: # mkdir /dev/cpuset # mount -t cpuset cpuset /dev/cpuset # mkdir /dev/cpuset/1 # echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus # echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems # echo $$ > /dev/cpuset/1/tasks # numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog <nr_tasks> & <nr_tasks> = max(nr_cpus - 1, 1) # killall -s SIGUSR1 cpuset_mem_hog # ./change_mems.sh several hours later, oom will happen though there is a lot of free memory. This patchset fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. This patch: In order to fix no node to alloc memory, when we want to update mempolicy and mems_allowed, we expand the set of nodes first (set all the newly nodes) and shrink the set of nodes lazily(clean disallowed nodes), But the mempolicy's rebind functions may breaks the expanding. So we restructure the mempolicy's rebind functions and split the rebind work to two steps, just like the update of cpuset's mems: The 1st step: expand the set of the mempolicy's nodes. The 2nd step: shrink the set of the mempolicy's nodes. It is used when there is no real lock to protect the mempolicy in the read-side. Otherwise we can do rebind work at once. In order to implement it, we define enum mpol_rebind_step { MPOL_REBIND_ONCE, MPOL_REBIND_STEP1, MPOL_REBIND_STEP2, MPOL_REBIND_NSTEP, }; If the mempolicy needn't be updated by two steps, we can pass MPOL_REBIND_ONCE to the rebind functions. Or we can pass MPOL_REBIND_STEP1 to do the first step of the rebind work and pass MPOL_REBIND_STEP2 to do the second step work. Besides that, it maybe long time between these two step and we have to release the lock that protects mempolicy and mems_allowed. If we hold the lock once again, we must check whether the current mempolicy is under the rebinding (the first step has been done) or not, because the task may alloc a new mempolicy when we don't hold the lock. So we defined the following flag to identify it: #define MPOL_F_REBINDING (1 << 2) The new functions will be used in the next patch. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Paul Menage <menage@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Ravikiran Thirumalai <kiran@scalex86.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andi Kleen <andi@firstfloor.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-24 21:32:07 +00:00
/*
* mpol_rebind_policy - Migrate a policy to a different set of nodes
*
* Per-vma policies are protected by mmap_lock. Allocations using per-task
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
* policies are protected by task->mems_allowed_seq to prevent a premature
* OOM/allocation failure due to parallel nodemask modification.
mempolicy: restructure rebinding-mempolicy functions Nick Piggin reported that the allocator may see an empty nodemask when changing cpuset's mems[1]. It happens only on the kernel that do not do atomic nodemask_t stores. (MAX_NUMNODES > BITS_PER_LONG) But I found that there is also a problem on the kernel that can do atomic nodemask_t stores. The problem is that the allocator can't find a node to alloc page when changing cpuset's mems though there is a lot of free memory. The reason is like this: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom I can use the attached program reproduce it by the following step: # mkdir /dev/cpuset # mount -t cpuset cpuset /dev/cpuset # mkdir /dev/cpuset/1 # echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus # echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems # echo $$ > /dev/cpuset/1/tasks # numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog <nr_tasks> & <nr_tasks> = max(nr_cpus - 1, 1) # killall -s SIGUSR1 cpuset_mem_hog # ./change_mems.sh several hours later, oom will happen though there is a lot of free memory. This patchset fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. This patch: In order to fix no node to alloc memory, when we want to update mempolicy and mems_allowed, we expand the set of nodes first (set all the newly nodes) and shrink the set of nodes lazily(clean disallowed nodes), But the mempolicy's rebind functions may breaks the expanding. So we restructure the mempolicy's rebind functions and split the rebind work to two steps, just like the update of cpuset's mems: The 1st step: expand the set of the mempolicy's nodes. The 2nd step: shrink the set of the mempolicy's nodes. It is used when there is no real lock to protect the mempolicy in the read-side. Otherwise we can do rebind work at once. In order to implement it, we define enum mpol_rebind_step { MPOL_REBIND_ONCE, MPOL_REBIND_STEP1, MPOL_REBIND_STEP2, MPOL_REBIND_NSTEP, }; If the mempolicy needn't be updated by two steps, we can pass MPOL_REBIND_ONCE to the rebind functions. Or we can pass MPOL_REBIND_STEP1 to do the first step of the rebind work and pass MPOL_REBIND_STEP2 to do the second step work. Besides that, it maybe long time between these two step and we have to release the lock that protects mempolicy and mems_allowed. If we hold the lock once again, we must check whether the current mempolicy is under the rebinding (the first step has been done) or not, because the task may alloc a new mempolicy when we don't hold the lock. So we defined the following flag to identify it: #define MPOL_F_REBINDING (1 << 2) The new functions will be used in the next patch. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Paul Menage <menage@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Ravikiran Thirumalai <kiran@scalex86.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andi Kleen <andi@firstfloor.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-24 21:32:07 +00:00
*/
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
mm/mempolicy: fix uninit-value in mpol_rebind_policy() mpol_set_nodemask()(mm/mempolicy.c) does not set up nodemask when pol->mode is MPOL_LOCAL. Check pol->mode before access pol->w.cpuset_mems_allowed in mpol_rebind_policy()(mm/mempolicy.c). BUG: KMSAN: uninit-value in mpol_rebind_policy mm/mempolicy.c:352 [inline] BUG: KMSAN: uninit-value in mpol_rebind_task+0x2ac/0x2c0 mm/mempolicy.c:368 mpol_rebind_policy mm/mempolicy.c:352 [inline] mpol_rebind_task+0x2ac/0x2c0 mm/mempolicy.c:368 cpuset_change_task_nodemask kernel/cgroup/cpuset.c:1711 [inline] cpuset_attach+0x787/0x15e0 kernel/cgroup/cpuset.c:2278 cgroup_migrate_execute+0x1023/0x1d20 kernel/cgroup/cgroup.c:2515 cgroup_migrate kernel/cgroup/cgroup.c:2771 [inline] cgroup_attach_task+0x540/0x8b0 kernel/cgroup/cgroup.c:2804 __cgroup1_procs_write+0x5cc/0x7a0 kernel/cgroup/cgroup-v1.c:520 cgroup1_tasks_write+0x94/0xb0 kernel/cgroup/cgroup-v1.c:539 cgroup_file_write+0x4c2/0x9e0 kernel/cgroup/cgroup.c:3852 kernfs_fop_write_iter+0x66a/0x9f0 fs/kernfs/file.c:296 call_write_iter include/linux/fs.h:2162 [inline] new_sync_write fs/read_write.c:503 [inline] vfs_write+0x1318/0x2030 fs/read_write.c:590 ksys_write+0x28b/0x510 fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __x64_sys_write+0xdb/0x120 fs/read_write.c:652 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was created at: slab_post_alloc_hook mm/slab.h:524 [inline] slab_alloc_node mm/slub.c:3251 [inline] slab_alloc mm/slub.c:3259 [inline] kmem_cache_alloc+0x902/0x11c0 mm/slub.c:3264 mpol_new mm/mempolicy.c:293 [inline] do_set_mempolicy+0x421/0xb70 mm/mempolicy.c:853 kernel_set_mempolicy mm/mempolicy.c:1504 [inline] __do_sys_set_mempolicy mm/mempolicy.c:1510 [inline] __se_sys_set_mempolicy+0x44c/0xb60 mm/mempolicy.c:1507 __x64_sys_set_mempolicy+0xd8/0x110 mm/mempolicy.c:1507 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae KMSAN: uninit-value in mpol_rebind_task (2) https://syzkaller.appspot.com/bug?id=d6eb90f952c2a5de9ea718a1b873c55cb13b59dc This patch seems to fix below bug too. KMSAN: uninit-value in mpol_rebind_mm (2) https://syzkaller.appspot.com/bug?id=f2fecd0d7013f54ec4162f60743a2b28df40926b The uninit-value is pol->w.cpuset_mems_allowed in mpol_rebind_policy(). When syzkaller reproducer runs to the beginning of mpol_new(), mpol_new() mm/mempolicy.c do_mbind() mm/mempolicy.c kernel_mbind() mm/mempolicy.c `mode` is 1(MPOL_PREFERRED), nodes_empty(*nodes) is `true` and `flags` is 0. Then mode = MPOL_LOCAL; ... policy->mode = mode; policy->flags = flags; will be executed. So in mpol_set_nodemask(), mpol_set_nodemask() mm/mempolicy.c do_mbind() kernel_mbind() pol->mode is 4 (MPOL_LOCAL), that `nodemask` in `pol` is not initialized, which will be accessed in mpol_rebind_policy(). Link: https://lkml.kernel.org/r/20220512123428.fq3wofedp6oiotd4@ppc.localdomain Signed-off-by: Wang Cheng <wanngchenng@gmail.com> Reported-by: <syzbot+217f792c92599518a2ab@syzkaller.appspotmail.com> Tested-by: <syzbot+217f792c92599518a2ab@syzkaller.appspotmail.com> Cc: David Rientjes <rientjes@google.com> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-05-19 21:08:54 +00:00
if (!pol || pol->mode == MPOL_LOCAL)
return;
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
mempolicy: restructure rebinding-mempolicy functions Nick Piggin reported that the allocator may see an empty nodemask when changing cpuset's mems[1]. It happens only on the kernel that do not do atomic nodemask_t stores. (MAX_NUMNODES > BITS_PER_LONG) But I found that there is also a problem on the kernel that can do atomic nodemask_t stores. The problem is that the allocator can't find a node to alloc page when changing cpuset's mems though there is a lot of free memory. The reason is like this: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom I can use the attached program reproduce it by the following step: # mkdir /dev/cpuset # mount -t cpuset cpuset /dev/cpuset # mkdir /dev/cpuset/1 # echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus # echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems # echo $$ > /dev/cpuset/1/tasks # numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog <nr_tasks> & <nr_tasks> = max(nr_cpus - 1, 1) # killall -s SIGUSR1 cpuset_mem_hog # ./change_mems.sh several hours later, oom will happen though there is a lot of free memory. This patchset fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. This patch: In order to fix no node to alloc memory, when we want to update mempolicy and mems_allowed, we expand the set of nodes first (set all the newly nodes) and shrink the set of nodes lazily(clean disallowed nodes), But the mempolicy's rebind functions may breaks the expanding. So we restructure the mempolicy's rebind functions and split the rebind work to two steps, just like the update of cpuset's mems: The 1st step: expand the set of the mempolicy's nodes. The 2nd step: shrink the set of the mempolicy's nodes. It is used when there is no real lock to protect the mempolicy in the read-side. Otherwise we can do rebind work at once. In order to implement it, we define enum mpol_rebind_step { MPOL_REBIND_ONCE, MPOL_REBIND_STEP1, MPOL_REBIND_STEP2, MPOL_REBIND_NSTEP, }; If the mempolicy needn't be updated by two steps, we can pass MPOL_REBIND_ONCE to the rebind functions. Or we can pass MPOL_REBIND_STEP1 to do the first step of the rebind work and pass MPOL_REBIND_STEP2 to do the second step work. Besides that, it maybe long time between these two step and we have to release the lock that protects mempolicy and mems_allowed. If we hold the lock once again, we must check whether the current mempolicy is under the rebinding (the first step has been done) or not, because the task may alloc a new mempolicy when we don't hold the lock. So we defined the following flag to identify it: #define MPOL_F_REBINDING (1 << 2) The new functions will be used in the next patch. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Paul Menage <menage@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Ravikiran Thirumalai <kiran@scalex86.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andi Kleen <andi@firstfloor.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-24 21:32:07 +00:00
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
mpol_ops[pol->mode].rebind(pol, newmask);
}
/*
* Wrapper for mpol_rebind_policy() that just requires task
* pointer, and updates task mempolicy.
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
*
* Called with task's alloc_lock held.
*/
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
mpol_rebind_policy(tsk->mempolicy, new);
}
/*
* Rebind each vma in mm to new nodemask.
*
* Call holding a reference to mm. Takes mm->mmap_lock during call.
*/
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse <walken@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com> Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Liam Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ying Han <yinghan@google.com> Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-09 04:33:25 +00:00
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
vma_start_write(vma);
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
mpol_rebind_policy(vma->vm_policy, new);
}
mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse <walken@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com> Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Liam Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ying Han <yinghan@google.com> Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-09 04:33:25 +00:00
mmap_write_unlock(mm);
}
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
[MPOL_DEFAULT] = {
.rebind = mpol_rebind_default,
},
[MPOL_INTERLEAVE] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
[MPOL_PREFERRED] = {
.create = mpol_new_preferred,
.rebind = mpol_rebind_preferred,
},
[MPOL_BIND] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
[MPOL_LOCAL] = {
.rebind = mpol_rebind_default,
},
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
[MPOL_PREFERRED_MANY] = {
.create = mpol_new_nodemask,
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
.rebind = mpol_rebind_preferred,
},
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
[MPOL_WEIGHTED_INTERLEAVE] = {
.create = mpol_new_nodemask,
.rebind = mpol_rebind_nodemask,
},
};
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags);
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
pgoff_t ilx, int *nid);
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
static bool strictly_unmovable(unsigned long flags)
{
/*
* STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
* if any misplaced page is found.
*/
return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
MPOL_MF_STRICT;
}
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
struct migration_mpol { /* for alloc_migration_target_by_mpol() */
struct mempolicy *pol;
pgoff_t ilx;
};
struct queue_pages {
struct list_head *pagelist;
unsigned long flags;
nodemask_t *nmask;
mm/mempolicy.c: fix checking unmapped holes for mbind mbind() is required to report EFAULT if range, specified by addr and len, contains unmapped holes. In current implementation, below rules are applied for this checking: 1: Unmapped holes at any part of the specified range should be reported as EFAULT if mbind() for none MPOL_DEFAULT cases; 2: Unmapped holes at any part of the specified range should be ignored (do not reprot EFAULT) if mbind() for MPOL_DEFAULT case; 3: The whole range in an unmapped hole should be reported as EFAULT; Note that rule 2 does not fullfill the mbind() API definition, but since that behavior has existed for long days (the internal flag MPOL_MF_DISCONTIG_OK is for this purpose), this patch does not plan to change it. In current code, application observed inconsistent behavior on rule 1 and rule 2 respectively. That inconsistency is fixed as below details. Cases of rule 1: - Hole at head side of range. Current code reprot EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code report EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at tail side of range. Current code do not report EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] Cases of rule 2: - Hole at head side of range. Current code reports EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] - Hole at tail side of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] This patch has no changes to rule 3. The unmapped hole checking can also be handled by using .pte_hole(), instead of .test_walk(). But .pte_hole() is called for holes inside and outside vma, which causes more cost, so this patch keeps the original design with .test_walk(). Link: http://lkml.kernel.org/r/1573218104-11021-3-git-send-email-lixinhai.lxh@gmail.com Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: linux-man <linux-man@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-01 01:56:18 +00:00
unsigned long start;
unsigned long end;
struct vm_area_struct *first;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
struct folio *large; /* note last large folio encountered */
long nr_failed; /* could not be isolated at this time */
};
mm: mempolicy: add queue_pages_required() Patch series "mm: page migration enhancement for thp", v9. Motivations: 1. THP migration becomes important in the upcoming heterogeneous memory systems. As David Nellans from NVIDIA pointed out from other threads (http://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1349227.html), future GPUs or other accelerators will have their memory managed by operating systems. Moving data into and out of these memory nodes efficiently is critical to applications that use GPUs or other accelerators. Existing page migration only supports base pages, which has a very low memory bandwidth utilization. My experiments (see below) show THP migration can migrate pages more efficiently. 2. Base page migration vs THP migration throughput. Here are cross-socket page migration results from calling move_pages() syscall: In x86_64, a Intel two-socket E5-2640v3 box, - single 4KB base page migration takes 62.47 us, using 0.06 GB/s BW, - single 2MB THP migration takes 658.54 us, using 2.97 GB/s BW, - 512 4KB base page migration takes 1987.38 us, using 0.98 GB/s BW. In ppc64, a two-socket Power8 box, - single 64KB base page migration takes 49.3 us, using 1.24 GB/s BW, - single 16MB THP migration takes 2202.17 us, using 7.10 GB/s BW, - 256 64KB base page migration takes 2543.65 us, using 6.14 GB/s BW. THP migration can give us 3x and 1.15x throughput over base page migration in x86_64 and ppc64 respectivley. You can test it out by using the code here: https://github.com/x-y-z/thp-migration-bench 3. Existing page migration splits THP before migration and cannot guarantee the migrated pages are still contiguous. Contiguity is always what GPUs and accelerators look for. Without THP migration, khugepaged needs to do extra work to reassemble the migrated pages back to THPs. This patch (of 10): Introduce a separate check routine related to MPOL_MF_INVERT flag. This patch just does cleanup, no behavioral change. Link: http://lkml.kernel.org/r/20170717193955.20207-2-zi.yan@sent.com Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Zi Yan <zi.yan@cs.rutgers.edu> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Nellans <dnellans@nvidia.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-09-08 23:10:42 +00:00
/*
* Check if the folio's nid is in qp->nmask.
mm: mempolicy: add queue_pages_required() Patch series "mm: page migration enhancement for thp", v9. Motivations: 1. THP migration becomes important in the upcoming heterogeneous memory systems. As David Nellans from NVIDIA pointed out from other threads (http://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1349227.html), future GPUs or other accelerators will have their memory managed by operating systems. Moving data into and out of these memory nodes efficiently is critical to applications that use GPUs or other accelerators. Existing page migration only supports base pages, which has a very low memory bandwidth utilization. My experiments (see below) show THP migration can migrate pages more efficiently. 2. Base page migration vs THP migration throughput. Here are cross-socket page migration results from calling move_pages() syscall: In x86_64, a Intel two-socket E5-2640v3 box, - single 4KB base page migration takes 62.47 us, using 0.06 GB/s BW, - single 2MB THP migration takes 658.54 us, using 2.97 GB/s BW, - 512 4KB base page migration takes 1987.38 us, using 0.98 GB/s BW. In ppc64, a two-socket Power8 box, - single 64KB base page migration takes 49.3 us, using 1.24 GB/s BW, - single 16MB THP migration takes 2202.17 us, using 7.10 GB/s BW, - 256 64KB base page migration takes 2543.65 us, using 6.14 GB/s BW. THP migration can give us 3x and 1.15x throughput over base page migration in x86_64 and ppc64 respectivley. You can test it out by using the code here: https://github.com/x-y-z/thp-migration-bench 3. Existing page migration splits THP before migration and cannot guarantee the migrated pages are still contiguous. Contiguity is always what GPUs and accelerators look for. Without THP migration, khugepaged needs to do extra work to reassemble the migrated pages back to THPs. This patch (of 10): Introduce a separate check routine related to MPOL_MF_INVERT flag. This patch just does cleanup, no behavioral change. Link: http://lkml.kernel.org/r/20170717193955.20207-2-zi.yan@sent.com Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Zi Yan <zi.yan@cs.rutgers.edu> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Nellans <dnellans@nvidia.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-09-08 23:10:42 +00:00
*
* If MPOL_MF_INVERT is set in qp->flags, check if the nid is
* in the invert of qp->nmask.
*/
static inline bool queue_folio_required(struct folio *folio,
mm: mempolicy: add queue_pages_required() Patch series "mm: page migration enhancement for thp", v9. Motivations: 1. THP migration becomes important in the upcoming heterogeneous memory systems. As David Nellans from NVIDIA pointed out from other threads (http://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1349227.html), future GPUs or other accelerators will have their memory managed by operating systems. Moving data into and out of these memory nodes efficiently is critical to applications that use GPUs or other accelerators. Existing page migration only supports base pages, which has a very low memory bandwidth utilization. My experiments (see below) show THP migration can migrate pages more efficiently. 2. Base page migration vs THP migration throughput. Here are cross-socket page migration results from calling move_pages() syscall: In x86_64, a Intel two-socket E5-2640v3 box, - single 4KB base page migration takes 62.47 us, using 0.06 GB/s BW, - single 2MB THP migration takes 658.54 us, using 2.97 GB/s BW, - 512 4KB base page migration takes 1987.38 us, using 0.98 GB/s BW. In ppc64, a two-socket Power8 box, - single 64KB base page migration takes 49.3 us, using 1.24 GB/s BW, - single 16MB THP migration takes 2202.17 us, using 7.10 GB/s BW, - 256 64KB base page migration takes 2543.65 us, using 6.14 GB/s BW. THP migration can give us 3x and 1.15x throughput over base page migration in x86_64 and ppc64 respectivley. You can test it out by using the code here: https://github.com/x-y-z/thp-migration-bench 3. Existing page migration splits THP before migration and cannot guarantee the migrated pages are still contiguous. Contiguity is always what GPUs and accelerators look for. Without THP migration, khugepaged needs to do extra work to reassemble the migrated pages back to THPs. This patch (of 10): Introduce a separate check routine related to MPOL_MF_INVERT flag. This patch just does cleanup, no behavioral change. Link: http://lkml.kernel.org/r/20170717193955.20207-2-zi.yan@sent.com Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Zi Yan <zi.yan@cs.rutgers.edu> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Nellans <dnellans@nvidia.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-09-08 23:10:42 +00:00
struct queue_pages *qp)
{
int nid = folio_nid(folio);
mm: mempolicy: add queue_pages_required() Patch series "mm: page migration enhancement for thp", v9. Motivations: 1. THP migration becomes important in the upcoming heterogeneous memory systems. As David Nellans from NVIDIA pointed out from other threads (http://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1349227.html), future GPUs or other accelerators will have their memory managed by operating systems. Moving data into and out of these memory nodes efficiently is critical to applications that use GPUs or other accelerators. Existing page migration only supports base pages, which has a very low memory bandwidth utilization. My experiments (see below) show THP migration can migrate pages more efficiently. 2. Base page migration vs THP migration throughput. Here are cross-socket page migration results from calling move_pages() syscall: In x86_64, a Intel two-socket E5-2640v3 box, - single 4KB base page migration takes 62.47 us, using 0.06 GB/s BW, - single 2MB THP migration takes 658.54 us, using 2.97 GB/s BW, - 512 4KB base page migration takes 1987.38 us, using 0.98 GB/s BW. In ppc64, a two-socket Power8 box, - single 64KB base page migration takes 49.3 us, using 1.24 GB/s BW, - single 16MB THP migration takes 2202.17 us, using 7.10 GB/s BW, - 256 64KB base page migration takes 2543.65 us, using 6.14 GB/s BW. THP migration can give us 3x and 1.15x throughput over base page migration in x86_64 and ppc64 respectivley. You can test it out by using the code here: https://github.com/x-y-z/thp-migration-bench 3. Existing page migration splits THP before migration and cannot guarantee the migrated pages are still contiguous. Contiguity is always what GPUs and accelerators look for. Without THP migration, khugepaged needs to do extra work to reassemble the migrated pages back to THPs. This patch (of 10): Introduce a separate check routine related to MPOL_MF_INVERT flag. This patch just does cleanup, no behavioral change. Link: http://lkml.kernel.org/r/20170717193955.20207-2-zi.yan@sent.com Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Zi Yan <zi.yan@cs.rutgers.edu> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Nellans <dnellans@nvidia.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-09-08 23:10:42 +00:00
unsigned long flags = qp->flags;
return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
struct folio *folio;
struct queue_pages *qp = walk->private;
if (unlikely(is_pmd_migration_entry(*pmd))) {
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
qp->nr_failed++;
return;
}
folio = pfn_folio(pmd_pfn(*pmd));
if (is_huge_zero_page(&folio->page)) {
walk->action = ACTION_CONTINUE;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
return;
}
if (!queue_folio_required(folio, qp))
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
return;
if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
!vma_migratable(walk->vma) ||
!migrate_folio_add(folio, qp->pagelist, qp->flags))
qp->nr_failed++;
}
/*
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* Scan through folios, checking if they satisfy the required conditions,
* moving them from LRU to local pagelist for migration if they do (or not).
mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified When both MPOL_MF_MOVE* and MPOL_MF_STRICT was specified, mbind() should try best to migrate misplaced pages, if some of the pages could not be migrated, then return -EIO. There are three different sub-cases: 1. vma is not migratable 2. vma is migratable, but there are unmovable pages 3. vma is migratable, pages are movable, but migrate_pages() fails If #1 happens, kernel would just abort immediately, then return -EIO, after a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified"). If #3 happens, kernel would set policy and migrate pages with best-effort, but won't rollback the migrated pages and reset the policy back. Before that commit, they behaves in the same way. It'd better to keep their behavior consistent. But, rolling back the migrated pages and resetting the policy back sounds not feasible, so just make #1 behave as same as #3. Userspace will know that not everything was successfully migrated (via -EIO), and can take whatever steps it deems necessary - attempt rollback, determine which exact page(s) are violating the policy, etc. Make queue_pages_range() return 1 to indicate there are unmovable pages or vma is not migratable. The #2 is not handled correctly in the current kernel, the following patch will fix it. [yang.shi@linux.alibaba.com: fix review comments from Vlastimil] Link: http://lkml.kernel.org/r/1563556862-54056-2-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1561162809-59140-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-13 22:37:15 +00:00
*
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* queue_folios_pte_range() has two possible return values:
* 0 - continue walking to scan for more, even if an existing folio on the
* wrong node could not be isolated and queued for migration.
* -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
* and an existing folio was on a node that does not follow the policy.
*/
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct folio *folio;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
pte_t *pte, *mapped_pte;
mm: ptep_get() conversion Convert all instances of direct pte_t* dereferencing to instead use ptep_get() helper. This means that by default, the accesses change from a C dereference to a READ_ONCE(). This is technically the correct thing to do since where pgtables are modified by HW (for access/dirty) they are volatile and therefore we should always ensure READ_ONCE() semantics. But more importantly, by always using the helper, it can be overridden by the architecture to fully encapsulate the contents of the pte. Arch code is deliberately not converted, as the arch code knows best. It is intended that arch code (arm64) will override the default with its own implementation that can (e.g.) hide certain bits from the core code, or determine young/dirty status by mixing in state from another source. Conversion was done using Coccinelle: ---- // $ make coccicheck \ // COCCI=ptepget.cocci \ // SPFLAGS="--include-headers" \ // MODE=patch virtual patch @ depends on patch @ pte_t *v; @@ - *v + ptep_get(v) ---- Then reviewed and hand-edited to avoid multiple unnecessary calls to ptep_get(), instead opting to store the result of a single call in a variable, where it is correct to do so. This aims to negate any cost of READ_ONCE() and will benefit arch-overrides that may be more complex. Included is a fix for an issue in an earlier version of this patch that was pointed out by kernel test robot. The issue arose because config MMU=n elides definition of the ptep helper functions, including ptep_get(). HUGETLB_PAGE=n configs still define a simple huge_ptep_clear_flush() for linking purposes, which dereferences the ptep. So when both configs are disabled, this caused a build error because ptep_get() is not defined. Fix by continuing to do a direct dereference when MMU=n. This is safe because for this config the arch code cannot be trying to virtualize the ptes because none of the ptep helpers are defined. Link: https://lkml.kernel.org/r/20230612151545.3317766-4-ryan.roberts@arm.com Reported-by: kernel test robot <lkp@intel.com> Link: https://lore.kernel.org/oe-kbuild-all/202305120142.yXsNEo6H-lkp@intel.com/ Signed-off-by: Ryan Roberts <ryan.roberts@arm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Potapenko <glider@google.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Andrey Konovalov <andreyknvl@gmail.com> Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com> Cc: Christian Brauner <brauner@kernel.org> Cc: Christoph Hellwig <hch@infradead.org> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Dave Airlie <airlied@gmail.com> Cc: Dimitri Sivanich <dimitri.sivanich@hpe.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Ian Rogers <irogers@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Mike Rapoport (IBM) <rppt@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi@nec.com> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: SeongJae Park <sj@kernel.org> Cc: Shakeel Butt <shakeelb@google.com> Cc: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Vincenzo Frascino <vincenzo.frascino@arm.com> Cc: Yu Zhao <yuzhao@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-06-12 15:15:45 +00:00
pte_t ptent;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (ptl) {
queue_folios_pmd(pmd, walk);
spin_unlock(ptl);
goto out;
}
mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
mm/pagewalkers: ACTION_AGAIN if pte_offset_map_lock() fails Simple walk_page_range() users should set ACTION_AGAIN to retry when pte_offset_map_lock() fails. No need to check pmd_trans_unstable(): that was precisely to avoid the possiblity of calling pte_offset_map() on a racily removed or inserted THP entry, but such cases are now safely handled inside it. Likewise there is no need to check pmd_none() or pmd_bad() before calling it. Link: https://lkml.kernel.org/r/c77d9d10-3aad-e3ce-4896-99e91c7947f3@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: SeongJae Park <sj@kernel.org> for mm/damon part Cc: Alistair Popple <apopple@nvidia.com> Cc: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Christoph Hellwig <hch@infradead.org> Cc: David Hildenbrand <david@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Ira Weiny <ira.weiny@intel.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Mike Rapoport (IBM) <rppt@kernel.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi@nec.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Peter Xu <peterx@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Qi Zheng <zhengqi.arch@bytedance.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Song Liu <song@kernel.org> Cc: Steven Price <steven.price@arm.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com> Cc: Will Deacon <will@kernel.org> Cc: Yang Shi <shy828301@gmail.com> Cc: Yu Zhao <yuzhao@google.com> Cc: Zack Rusin <zackr@vmware.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-06-09 01:17:26 +00:00
if (!pte) {
walk->action = ACTION_AGAIN;
return 0;
}
for (; addr != end; pte++, addr += PAGE_SIZE) {
mm: ptep_get() conversion Convert all instances of direct pte_t* dereferencing to instead use ptep_get() helper. This means that by default, the accesses change from a C dereference to a READ_ONCE(). This is technically the correct thing to do since where pgtables are modified by HW (for access/dirty) they are volatile and therefore we should always ensure READ_ONCE() semantics. But more importantly, by always using the helper, it can be overridden by the architecture to fully encapsulate the contents of the pte. Arch code is deliberately not converted, as the arch code knows best. It is intended that arch code (arm64) will override the default with its own implementation that can (e.g.) hide certain bits from the core code, or determine young/dirty status by mixing in state from another source. Conversion was done using Coccinelle: ---- // $ make coccicheck \ // COCCI=ptepget.cocci \ // SPFLAGS="--include-headers" \ // MODE=patch virtual patch @ depends on patch @ pte_t *v; @@ - *v + ptep_get(v) ---- Then reviewed and hand-edited to avoid multiple unnecessary calls to ptep_get(), instead opting to store the result of a single call in a variable, where it is correct to do so. This aims to negate any cost of READ_ONCE() and will benefit arch-overrides that may be more complex. Included is a fix for an issue in an earlier version of this patch that was pointed out by kernel test robot. The issue arose because config MMU=n elides definition of the ptep helper functions, including ptep_get(). HUGETLB_PAGE=n configs still define a simple huge_ptep_clear_flush() for linking purposes, which dereferences the ptep. So when both configs are disabled, this caused a build error because ptep_get() is not defined. Fix by continuing to do a direct dereference when MMU=n. This is safe because for this config the arch code cannot be trying to virtualize the ptes because none of the ptep helpers are defined. Link: https://lkml.kernel.org/r/20230612151545.3317766-4-ryan.roberts@arm.com Reported-by: kernel test robot <lkp@intel.com> Link: https://lore.kernel.org/oe-kbuild-all/202305120142.yXsNEo6H-lkp@intel.com/ Signed-off-by: Ryan Roberts <ryan.roberts@arm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Potapenko <glider@google.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Andrey Konovalov <andreyknvl@gmail.com> Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com> Cc: Christian Brauner <brauner@kernel.org> Cc: Christoph Hellwig <hch@infradead.org> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Dave Airlie <airlied@gmail.com> Cc: Dimitri Sivanich <dimitri.sivanich@hpe.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Ian Rogers <irogers@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Mike Rapoport (IBM) <rppt@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi@nec.com> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: SeongJae Park <sj@kernel.org> Cc: Shakeel Butt <shakeelb@google.com> Cc: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Vincenzo Frascino <vincenzo.frascino@arm.com> Cc: Yu Zhao <yuzhao@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-06-12 15:15:45 +00:00
ptent = ptep_get(pte);
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (pte_none(ptent))
continue;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (!pte_present(ptent)) {
if (is_migration_entry(pte_to_swp_entry(ptent)))
qp->nr_failed++;
continue;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
}
mm: ptep_get() conversion Convert all instances of direct pte_t* dereferencing to instead use ptep_get() helper. This means that by default, the accesses change from a C dereference to a READ_ONCE(). This is technically the correct thing to do since where pgtables are modified by HW (for access/dirty) they are volatile and therefore we should always ensure READ_ONCE() semantics. But more importantly, by always using the helper, it can be overridden by the architecture to fully encapsulate the contents of the pte. Arch code is deliberately not converted, as the arch code knows best. It is intended that arch code (arm64) will override the default with its own implementation that can (e.g.) hide certain bits from the core code, or determine young/dirty status by mixing in state from another source. Conversion was done using Coccinelle: ---- // $ make coccicheck \ // COCCI=ptepget.cocci \ // SPFLAGS="--include-headers" \ // MODE=patch virtual patch @ depends on patch @ pte_t *v; @@ - *v + ptep_get(v) ---- Then reviewed and hand-edited to avoid multiple unnecessary calls to ptep_get(), instead opting to store the result of a single call in a variable, where it is correct to do so. This aims to negate any cost of READ_ONCE() and will benefit arch-overrides that may be more complex. Included is a fix for an issue in an earlier version of this patch that was pointed out by kernel test robot. The issue arose because config MMU=n elides definition of the ptep helper functions, including ptep_get(). HUGETLB_PAGE=n configs still define a simple huge_ptep_clear_flush() for linking purposes, which dereferences the ptep. So when both configs are disabled, this caused a build error because ptep_get() is not defined. Fix by continuing to do a direct dereference when MMU=n. This is safe because for this config the arch code cannot be trying to virtualize the ptes because none of the ptep helpers are defined. Link: https://lkml.kernel.org/r/20230612151545.3317766-4-ryan.roberts@arm.com Reported-by: kernel test robot <lkp@intel.com> Link: https://lore.kernel.org/oe-kbuild-all/202305120142.yXsNEo6H-lkp@intel.com/ Signed-off-by: Ryan Roberts <ryan.roberts@arm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Potapenko <glider@google.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Andrey Konovalov <andreyknvl@gmail.com> Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com> Cc: Christian Brauner <brauner@kernel.org> Cc: Christoph Hellwig <hch@infradead.org> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Dave Airlie <airlied@gmail.com> Cc: Dimitri Sivanich <dimitri.sivanich@hpe.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Ian Rogers <irogers@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Mike Rapoport (IBM) <rppt@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi@nec.com> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: SeongJae Park <sj@kernel.org> Cc: Shakeel Butt <shakeelb@google.com> Cc: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Vincenzo Frascino <vincenzo.frascino@arm.com> Cc: Yu Zhao <yuzhao@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-06-12 15:15:45 +00:00
folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
/*
* vm_normal_folio() filters out zero pages, but there might
* still be reserved folios to skip, perhaps in a VDSO.
*/
if (folio_test_reserved(folio))
continue;
if (!queue_folio_required(folio, qp))
continue;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (folio_test_large(folio)) {
mm: mempolicy: handle vma with unmovable pages mapped correctly in mbind When running syzkaller internally, we ran into the below bug on 4.9.x kernel: kernel BUG at mm/huge_memory.c:2124! invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 1518 Comm: syz-executor107 Not tainted 4.9.168+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.5.1 01/01/2011 task: ffff880067b34900 task.stack: ffff880068998000 RIP: split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124 Call Trace: split_huge_page include/linux/huge_mm.h:100 [inline] queue_pages_pte_range+0x7e1/0x1480 mm/mempolicy.c:538 walk_pmd_range mm/pagewalk.c:50 [inline] walk_pud_range mm/pagewalk.c:90 [inline] walk_pgd_range mm/pagewalk.c:116 [inline] __walk_page_range+0x44a/0xdb0 mm/pagewalk.c:208 walk_page_range+0x154/0x370 mm/pagewalk.c:285 queue_pages_range+0x115/0x150 mm/mempolicy.c:694 do_mbind mm/mempolicy.c:1241 [inline] SYSC_mbind+0x3c3/0x1030 mm/mempolicy.c:1370 SyS_mbind+0x46/0x60 mm/mempolicy.c:1352 do_syscall_64+0x1d2/0x600 arch/x86/entry/common.c:282 entry_SYSCALL_64_after_swapgs+0x5d/0xdb Code: c7 80 1c 02 00 e8 26 0a 76 01 <0f> 0b 48 c7 c7 40 46 45 84 e8 4c RIP [<ffffffff81895d6b>] split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124 RSP <ffff88006899f980> with the below test: uint64_t r[1] = {0xffffffffffffffff}; int main(void) { syscall(__NR_mmap, 0x20000000, 0x1000000, 3, 0x32, -1, 0); intptr_t res = 0; res = syscall(__NR_socket, 0x11, 3, 0x300); if (res != -1) r[0] = res; *(uint32_t*)0x20000040 = 0x10000; *(uint32_t*)0x20000044 = 1; *(uint32_t*)0x20000048 = 0xc520; *(uint32_t*)0x2000004c = 1; syscall(__NR_setsockopt, r[0], 0x107, 0xd, 0x20000040, 0x10); syscall(__NR_mmap, 0x20fed000, 0x10000, 0, 0x8811, r[0], 0); *(uint64_t*)0x20000340 = 2; syscall(__NR_mbind, 0x20ff9000, 0x4000, 0x4002, 0x20000340, 0x45d4, 3); return 0; } Actually the test does: mmap(0x20000000, 16777216, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x20000000 socket(AF_PACKET, SOCK_RAW, 768) = 3 setsockopt(3, SOL_PACKET, PACKET_TX_RING, {block_size=65536, block_nr=1, frame_size=50464, frame_nr=1}, 16) = 0 mmap(0x20fed000, 65536, PROT_NONE, MAP_SHARED|MAP_FIXED|MAP_POPULATE|MAP_DENYWRITE, 3, 0) = 0x20fed000 mbind(..., MPOL_MF_STRICT|MPOL_MF_MOVE) = 0 The setsockopt() would allocate compound pages (16 pages in this test) for packet tx ring, then the mmap() would call packet_mmap() to map the pages into the user address space specified by the mmap() call. When calling mbind(), it would scan the vma to queue the pages for migration to the new node. It would split any huge page since 4.9 doesn't support THP migration, however, the packet tx ring compound pages are not THP and even not movable. So, the above bug is triggered. However, the later kernel is not hit by this issue due to commit d44d363f6578 ("mm: don't assume anonymous pages have SwapBacked flag"), which just removes the PageSwapBacked check for a different reason. But, there is a deeper issue. According to the semantic of mbind(), it should return -EIO if MPOL_MF_MOVE or MPOL_MF_MOVE_ALL was specified and MPOL_MF_STRICT was also specified, but the kernel was unable to move all existing pages in the range. The tx ring of the packet socket is definitely not movable, however, mbind() returns success for this case. Although the most socket file associates with non-movable pages, but XDP may have movable pages from gup. So, it sounds not fine to just check the underlying file type of vma in vma_migratable(). Change migrate_page_add() to check if the page is movable or not, if it is unmovable, just return -EIO. But do not abort pte walk immediately, since there may be pages off LRU temporarily. We should migrate other pages if MPOL_MF_MOVE* is specified. Set has_unmovable flag if some paged could not be not moved, then return -EIO for mbind() eventually. With this change the above test would return -EIO as expected. [yang.shi@linux.alibaba.com: fix review comments from Vlastimil] Link: http://lkml.kernel.org/r/1563556862-54056-3-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1561162809-59140-3-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-13 22:37:18 +00:00
/*
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* A large folio can only be isolated from LRU once,
* but may be mapped by many PTEs (and Copy-On-Write may
* intersperse PTEs of other, order 0, folios). This is
* a common case, so don't mistake it for failure (but
* there can be other cases of multi-mapped pages which
* this quick check does not help to filter out - and a
* search of the pagelist might grow to be prohibitive).
*
* migrate_pages(&pagelist) returns nr_failed folios, so
* check "large" now so that queue_pages_range() returns
* a comparable nr_failed folios. This does imply that
* if folio could not be isolated for some racy reason
* at its first PTE, later PTEs will not give it another
* chance of isolation; but keeps the accounting simple.
mm: mempolicy: handle vma with unmovable pages mapped correctly in mbind When running syzkaller internally, we ran into the below bug on 4.9.x kernel: kernel BUG at mm/huge_memory.c:2124! invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 1518 Comm: syz-executor107 Not tainted 4.9.168+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.5.1 01/01/2011 task: ffff880067b34900 task.stack: ffff880068998000 RIP: split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124 Call Trace: split_huge_page include/linux/huge_mm.h:100 [inline] queue_pages_pte_range+0x7e1/0x1480 mm/mempolicy.c:538 walk_pmd_range mm/pagewalk.c:50 [inline] walk_pud_range mm/pagewalk.c:90 [inline] walk_pgd_range mm/pagewalk.c:116 [inline] __walk_page_range+0x44a/0xdb0 mm/pagewalk.c:208 walk_page_range+0x154/0x370 mm/pagewalk.c:285 queue_pages_range+0x115/0x150 mm/mempolicy.c:694 do_mbind mm/mempolicy.c:1241 [inline] SYSC_mbind+0x3c3/0x1030 mm/mempolicy.c:1370 SyS_mbind+0x46/0x60 mm/mempolicy.c:1352 do_syscall_64+0x1d2/0x600 arch/x86/entry/common.c:282 entry_SYSCALL_64_after_swapgs+0x5d/0xdb Code: c7 80 1c 02 00 e8 26 0a 76 01 <0f> 0b 48 c7 c7 40 46 45 84 e8 4c RIP [<ffffffff81895d6b>] split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124 RSP <ffff88006899f980> with the below test: uint64_t r[1] = {0xffffffffffffffff}; int main(void) { syscall(__NR_mmap, 0x20000000, 0x1000000, 3, 0x32, -1, 0); intptr_t res = 0; res = syscall(__NR_socket, 0x11, 3, 0x300); if (res != -1) r[0] = res; *(uint32_t*)0x20000040 = 0x10000; *(uint32_t*)0x20000044 = 1; *(uint32_t*)0x20000048 = 0xc520; *(uint32_t*)0x2000004c = 1; syscall(__NR_setsockopt, r[0], 0x107, 0xd, 0x20000040, 0x10); syscall(__NR_mmap, 0x20fed000, 0x10000, 0, 0x8811, r[0], 0); *(uint64_t*)0x20000340 = 2; syscall(__NR_mbind, 0x20ff9000, 0x4000, 0x4002, 0x20000340, 0x45d4, 3); return 0; } Actually the test does: mmap(0x20000000, 16777216, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x20000000 socket(AF_PACKET, SOCK_RAW, 768) = 3 setsockopt(3, SOL_PACKET, PACKET_TX_RING, {block_size=65536, block_nr=1, frame_size=50464, frame_nr=1}, 16) = 0 mmap(0x20fed000, 65536, PROT_NONE, MAP_SHARED|MAP_FIXED|MAP_POPULATE|MAP_DENYWRITE, 3, 0) = 0x20fed000 mbind(..., MPOL_MF_STRICT|MPOL_MF_MOVE) = 0 The setsockopt() would allocate compound pages (16 pages in this test) for packet tx ring, then the mmap() would call packet_mmap() to map the pages into the user address space specified by the mmap() call. When calling mbind(), it would scan the vma to queue the pages for migration to the new node. It would split any huge page since 4.9 doesn't support THP migration, however, the packet tx ring compound pages are not THP and even not movable. So, the above bug is triggered. However, the later kernel is not hit by this issue due to commit d44d363f6578 ("mm: don't assume anonymous pages have SwapBacked flag"), which just removes the PageSwapBacked check for a different reason. But, there is a deeper issue. According to the semantic of mbind(), it should return -EIO if MPOL_MF_MOVE or MPOL_MF_MOVE_ALL was specified and MPOL_MF_STRICT was also specified, but the kernel was unable to move all existing pages in the range. The tx ring of the packet socket is definitely not movable, however, mbind() returns success for this case. Although the most socket file associates with non-movable pages, but XDP may have movable pages from gup. So, it sounds not fine to just check the underlying file type of vma in vma_migratable(). Change migrate_page_add() to check if the page is movable or not, if it is unmovable, just return -EIO. But do not abort pte walk immediately, since there may be pages off LRU temporarily. We should migrate other pages if MPOL_MF_MOVE* is specified. Set has_unmovable flag if some paged could not be not moved, then return -EIO for mbind() eventually. With this change the above test would return -EIO as expected. [yang.shi@linux.alibaba.com: fix review comments from Vlastimil] Link: http://lkml.kernel.org/r/1563556862-54056-3-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1561162809-59140-3-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-13 22:37:18 +00:00
*/
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (folio == qp->large)
continue;
qp->large = folio;
}
if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
!vma_migratable(vma) ||
!migrate_folio_add(folio, qp->pagelist, flags)) {
qp->nr_failed++;
if (strictly_unmovable(flags))
break;
}
}
pte_unmap_unlock(mapped_pte, ptl);
cond_resched();
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
out:
if (qp->nr_failed && strictly_unmovable(flags))
return -EIO;
return 0;
}
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
unsigned long flags = qp->flags;
struct folio *folio;
spinlock_t *ptl;
pte_t entry;
ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
entry = huge_ptep_get(pte);
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (!pte_present(entry)) {
if (unlikely(is_hugetlb_entry_migration(entry)))
qp->nr_failed++;
goto unlock;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
}
folio = pfn_folio(pte_pfn(entry));
if (!queue_folio_required(folio, qp))
goto unlock;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
!vma_migratable(walk->vma)) {
qp->nr_failed++;
mm/mempolicy: support MPOL_MF_STRICT for huge page mapping MPOL_MF_STRICT is used in mbind() for purposes: (1) MPOL_MF_STRICT is set alone without MPOL_MF_MOVE or MPOL_MF_MOVE_ALL, to check if there is misplaced page and return -EIO; (2) MPOL_MF_STRICT is set with MPOL_MF_MOVE or MPOL_MF_MOVE_ALL, to check if there is misplaced page which is failed to isolate, or page is success on isolate but failed to move, and return -EIO. For non hugepage mapping, (1) and (2) are implemented as expectation. For hugepage mapping, (1) is not implemented. And in (2), the part about failed to isolate and report -EIO is not implemented. This patch implements the missed parts for hugepage mapping. Benefits with it applied: - User space can apply same code logic to handle mbind() on hugepage and non hugepage mapping; - Reliably using MPOL_MF_STRICT alone to check whether there is misplaced page or not when bind policy on address range, especially for address range which contains both hugepage and non hugepage mapping. Analysis of potential impact to existing users: - If MPOL_MF_STRICT alone was previously used, hugetlb pages not following the memory policy would not cause an EIO error. After this change, hugetlb pages are treated like all other pages. If MPOL_MF_STRICT alone is used and hugetlb pages do not follow memory policy an EIO error will be returned. - For users who using MPOL_MF_STRICT with MPOL_MF_MOVE or MPOL_MF_MOVE_ALL, the semantic about some pages could not be moved will not be changed by this patch, because failed to isolate and failed to move have same effects to users, so their existing code will not be impacted. In mbind man page, the note about 'MPOL_MF_STRICT is ignored on huge page mappings' can be removed after this patch is applied. Mike: : The current behavior with MPOL_MF_STRICT and hugetlb pages is inconsistent : and does not match documentation (as described above). The special : behavior for hugetlb pages ideally should have been removed when hugetlb : page migration was introduced. It is unlikely that anyone relies on : today's inconsistent behavior, and removing one more case of special : handling for hugetlb pages is a good thing. Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Reviewed-by: Naoya Horiguchi <naoya.horiguchi@nec.com> Cc: Michal Hocko <mhocko@suse.com> Cc: linux-man <linux-man@vger.kernel.org> Link: http://lkml.kernel.org/r/1581559627-6206-1-git-send-email-lixinhai.lxh@gmail.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 04:10:48 +00:00
goto unlock;
}
/*
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
* To check if the folio is shared, ideally we want to make sure
* every page is mapped to the same process. Doing that is very
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* expensive, so check the estimated sharers of the folio instead.
*/
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if ((flags & MPOL_MF_MOVE_ALL) ||
(folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte)))
if (!isolate_hugetlb(folio, qp->pagelist))
qp->nr_failed++;
unlock:
spin_unlock(ptl);
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (qp->nr_failed && strictly_unmovable(flags))
return -EIO;
#endif
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
return 0;
}
#ifdef CONFIG_NUMA_BALANCING
/*
* This is used to mark a range of virtual addresses to be inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
* faults, pages may be migrated for better NUMA placement.
*
* This is assuming that NUMA faults are handled using PROT_NONE. If
* an architecture makes a different choice, it will need further
* changes to the core.
*/
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
mm/mprotect: use mmu_gather Patch series "mm/mprotect: avoid unnecessary TLB flushes", v6. This patchset is intended to remove unnecessary TLB flushes during mprotect() syscalls. Once this patch-set make it through, similar and further optimizations for MADV_COLD and userfaultfd would be possible. Basically, there are 3 optimizations in this patch-set: 1. Use TLB batching infrastructure to batch flushes across VMAs and do better/fewer flushes. This would also be handy for later userfaultfd enhancements. 2. Avoid unnecessary TLB flushes. This optimization is the one that provides most of the performance benefits. Unlike previous versions, we now only avoid flushes that would not result in spurious page-faults. 3. Avoiding TLB flushes on change_huge_pmd() that are only needed to prevent the A/D bits from changing. Andrew asked for some benchmark numbers. I do not have an easy determinate macrobenchmark in which it is easy to show benefit. I therefore ran a microbenchmark: a loop that does the following on anonymous memory, just as a sanity check to see that time is saved by avoiding TLB flushes. The loop goes: mprotect(p, PAGE_SIZE, PROT_READ) mprotect(p, PAGE_SIZE, PROT_READ|PROT_WRITE) *p = 0; // make the page writable The test was run in KVM guest with 1 or 2 threads (the second thread was busy-looping). I measured the time (cycles) of each operation: 1 thread 2 threads mmots +patch mmots +patch PROT_READ 3494 2725 (-22%) 8630 7788 (-10%) PROT_READ|WRITE 3952 2724 (-31%) 9075 2865 (-68%) [ mmots = v5.17-rc6-mmots-2022-03-06-20-38 ] The exact numbers are really meaningless, but the benefit is clear. There are 2 interesting results though. (1) PROT_READ is cheaper, while one can expect it not to be affected. This is presumably due to TLB miss that is saved (2) Without memory access (*p = 0), the speedup of the patch is even greater. In that scenario mprotect(PROT_READ) also avoids the TLB flush. As a result both operations on the patched kernel take roughly ~1500 cycles (with either 1 or 2 threads), whereas on mmotm their cost is as high as presented in the table. This patch (of 3): change_pXX_range() currently does not use mmu_gather, but instead implements its own deferred TLB flushes scheme. This both complicates the code, as developers need to be aware of different invalidation schemes, and prevents opportunities to avoid TLB flushes or perform them in finer granularity. The use of mmu_gather for modified PTEs has benefits in various scenarios even if pages are not released. For instance, if only a single page needs to be flushed out of a range of many pages, only that page would be flushed. If a THP page is flushed, on x86 a single TLB invlpg instruction can be used instead of 512 instructions (or a full TLB flush, which would Linux would actually use by default). mprotect() over multiple VMAs requires a single flush. Use mmu_gather in change_pXX_range(). As the pages are not released, only record the flushed range using tlb_flush_pXX_range(). Handle THP similarly and get rid of flush_cache_range() which becomes redundant since tlb_start_vma() calls it when needed. Link: https://lkml.kernel.org/r/20220401180821.1986781-1-namit@vmware.com Link: https://lkml.kernel.org/r/20220401180821.1986781-2-namit@vmware.com Signed-off-by: Nadav Amit <namit@vmware.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Andrew Cooper <andrew.cooper3@citrix.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Peter Xu <peterx@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Cc: Yu Zhao <yuzhao@google.com> Cc: Nick Piggin <npiggin@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-05-10 01:20:50 +00:00
struct mmu_gather tlb;
long nr_updated;
mm/mprotect: use mmu_gather Patch series "mm/mprotect: avoid unnecessary TLB flushes", v6. This patchset is intended to remove unnecessary TLB flushes during mprotect() syscalls. Once this patch-set make it through, similar and further optimizations for MADV_COLD and userfaultfd would be possible. Basically, there are 3 optimizations in this patch-set: 1. Use TLB batching infrastructure to batch flushes across VMAs and do better/fewer flushes. This would also be handy for later userfaultfd enhancements. 2. Avoid unnecessary TLB flushes. This optimization is the one that provides most of the performance benefits. Unlike previous versions, we now only avoid flushes that would not result in spurious page-faults. 3. Avoiding TLB flushes on change_huge_pmd() that are only needed to prevent the A/D bits from changing. Andrew asked for some benchmark numbers. I do not have an easy determinate macrobenchmark in which it is easy to show benefit. I therefore ran a microbenchmark: a loop that does the following on anonymous memory, just as a sanity check to see that time is saved by avoiding TLB flushes. The loop goes: mprotect(p, PAGE_SIZE, PROT_READ) mprotect(p, PAGE_SIZE, PROT_READ|PROT_WRITE) *p = 0; // make the page writable The test was run in KVM guest with 1 or 2 threads (the second thread was busy-looping). I measured the time (cycles) of each operation: 1 thread 2 threads mmots +patch mmots +patch PROT_READ 3494 2725 (-22%) 8630 7788 (-10%) PROT_READ|WRITE 3952 2724 (-31%) 9075 2865 (-68%) [ mmots = v5.17-rc6-mmots-2022-03-06-20-38 ] The exact numbers are really meaningless, but the benefit is clear. There are 2 interesting results though. (1) PROT_READ is cheaper, while one can expect it not to be affected. This is presumably due to TLB miss that is saved (2) Without memory access (*p = 0), the speedup of the patch is even greater. In that scenario mprotect(PROT_READ) also avoids the TLB flush. As a result both operations on the patched kernel take roughly ~1500 cycles (with either 1 or 2 threads), whereas on mmotm their cost is as high as presented in the table. This patch (of 3): change_pXX_range() currently does not use mmu_gather, but instead implements its own deferred TLB flushes scheme. This both complicates the code, as developers need to be aware of different invalidation schemes, and prevents opportunities to avoid TLB flushes or perform them in finer granularity. The use of mmu_gather for modified PTEs has benefits in various scenarios even if pages are not released. For instance, if only a single page needs to be flushed out of a range of many pages, only that page would be flushed. If a THP page is flushed, on x86 a single TLB invlpg instruction can be used instead of 512 instructions (or a full TLB flush, which would Linux would actually use by default). mprotect() over multiple VMAs requires a single flush. Use mmu_gather in change_pXX_range(). As the pages are not released, only record the flushed range using tlb_flush_pXX_range(). Handle THP similarly and get rid of flush_cache_range() which becomes redundant since tlb_start_vma() calls it when needed. Link: https://lkml.kernel.org/r/20220401180821.1986781-1-namit@vmware.com Link: https://lkml.kernel.org/r/20220401180821.1986781-2-namit@vmware.com Signed-off-by: Nadav Amit <namit@vmware.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Andrew Cooper <andrew.cooper3@citrix.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Peter Xu <peterx@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Cc: Yu Zhao <yuzhao@google.com> Cc: Nick Piggin <npiggin@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-05-10 01:20:50 +00:00
tlb_gather_mmu(&tlb, vma->vm_mm);
nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
if (nr_updated > 0)
mm: numa: Add pte updates, hinting and migration stats It is tricky to quantify the basic cost of automatic NUMA placement in a meaningful manner. This patch adds some vmstats that can be used as part of a basic costing model. u = basic unit = sizeof(void *) Ca = cost of struct page access = sizeof(struct page) / u Cpte = Cost PTE access = Ca Cupdate = Cost PTE update = (2 * Cpte) + (2 * Wlock) where Cpte is incurred twice for a read and a write and Wlock is a constant representing the cost of taking or releasing a lock Cnumahint = Cost of a minor page fault = some high constant e.g. 1000 Cpagerw = Cost to read or write a full page = Ca + PAGE_SIZE/u Ci = Cost of page isolation = Ca + Wi where Wi is a constant that should reflect the approximate cost of the locking operation Cpagecopy = Cpagerw + (Cpagerw * Wnuma) + Ci + (Ci * Wnuma) where Wnuma is the approximate NUMA factor. 1 is local. 1.2 would imply that remote accesses are 20% more expensive Balancing cost = Cpte * numa_pte_updates + Cnumahint * numa_hint_faults + Ci * numa_pages_migrated + Cpagecopy * numa_pages_migrated Note that numa_pages_migrated is used as a measure of how many pages were isolated even though it would miss pages that failed to migrate. A vmstat counter could have been added for it but the isolation cost is pretty marginal in comparison to the overall cost so it seemed overkill. The ideal way to measure automatic placement benefit would be to count the number of remote accesses versus local accesses and do something like benefit = (remote_accesses_before - remove_access_after) * Wnuma but the information is not readily available. As a workload converges, the expection would be that the number of remote numa hints would reduce to 0. convergence = numa_hint_faults_local / numa_hint_faults where this is measured for the last N number of numa hints recorded. When the workload is fully converged the value is 1. This can measure if the placement policy is converging and how fast it is doing it. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Rik van Riel <riel@redhat.com>
2012-11-02 14:52:48 +00:00
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
mm/mprotect: use mmu_gather Patch series "mm/mprotect: avoid unnecessary TLB flushes", v6. This patchset is intended to remove unnecessary TLB flushes during mprotect() syscalls. Once this patch-set make it through, similar and further optimizations for MADV_COLD and userfaultfd would be possible. Basically, there are 3 optimizations in this patch-set: 1. Use TLB batching infrastructure to batch flushes across VMAs and do better/fewer flushes. This would also be handy for later userfaultfd enhancements. 2. Avoid unnecessary TLB flushes. This optimization is the one that provides most of the performance benefits. Unlike previous versions, we now only avoid flushes that would not result in spurious page-faults. 3. Avoiding TLB flushes on change_huge_pmd() that are only needed to prevent the A/D bits from changing. Andrew asked for some benchmark numbers. I do not have an easy determinate macrobenchmark in which it is easy to show benefit. I therefore ran a microbenchmark: a loop that does the following on anonymous memory, just as a sanity check to see that time is saved by avoiding TLB flushes. The loop goes: mprotect(p, PAGE_SIZE, PROT_READ) mprotect(p, PAGE_SIZE, PROT_READ|PROT_WRITE) *p = 0; // make the page writable The test was run in KVM guest with 1 or 2 threads (the second thread was busy-looping). I measured the time (cycles) of each operation: 1 thread 2 threads mmots +patch mmots +patch PROT_READ 3494 2725 (-22%) 8630 7788 (-10%) PROT_READ|WRITE 3952 2724 (-31%) 9075 2865 (-68%) [ mmots = v5.17-rc6-mmots-2022-03-06-20-38 ] The exact numbers are really meaningless, but the benefit is clear. There are 2 interesting results though. (1) PROT_READ is cheaper, while one can expect it not to be affected. This is presumably due to TLB miss that is saved (2) Without memory access (*p = 0), the speedup of the patch is even greater. In that scenario mprotect(PROT_READ) also avoids the TLB flush. As a result both operations on the patched kernel take roughly ~1500 cycles (with either 1 or 2 threads), whereas on mmotm their cost is as high as presented in the table. This patch (of 3): change_pXX_range() currently does not use mmu_gather, but instead implements its own deferred TLB flushes scheme. This both complicates the code, as developers need to be aware of different invalidation schemes, and prevents opportunities to avoid TLB flushes or perform them in finer granularity. The use of mmu_gather for modified PTEs has benefits in various scenarios even if pages are not released. For instance, if only a single page needs to be flushed out of a range of many pages, only that page would be flushed. If a THP page is flushed, on x86 a single TLB invlpg instruction can be used instead of 512 instructions (or a full TLB flush, which would Linux would actually use by default). mprotect() over multiple VMAs requires a single flush. Use mmu_gather in change_pXX_range(). As the pages are not released, only record the flushed range using tlb_flush_pXX_range(). Handle THP similarly and get rid of flush_cache_range() which becomes redundant since tlb_start_vma() calls it when needed. Link: https://lkml.kernel.org/r/20220401180821.1986781-1-namit@vmware.com Link: https://lkml.kernel.org/r/20220401180821.1986781-2-namit@vmware.com Signed-off-by: Nadav Amit <namit@vmware.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Andrew Cooper <andrew.cooper3@citrix.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Peter Xu <peterx@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will@kernel.org> Cc: Yu Zhao <yuzhao@google.com> Cc: Nick Piggin <npiggin@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-05-10 01:20:50 +00:00
tlb_finish_mmu(&tlb);
return nr_updated;
}
#endif /* CONFIG_NUMA_BALANCING */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *next, *vma = walk->vma;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
mm/mempolicy.c: check range first in queue_pages_test_walk Patch series "mm: Fix checking unmapped holes for mbind", v4. This patchset fix checking unmapped holes for mbind(). First patch makes sure the vma been correctly tracked in .test_walk(), so each time when .test_walk() is called, the neighborhood of two vma is correct. Current problem is that the !vma_migratable() check could cause return immediately without update tracking to vma. Second patch fix the inconsistent report of EFAULT when mbind() is called for MPOL_DEFAULT and non MPOL_DEFAULT cases, so application do not need to have workaround code to handle this special behavior. Currently there are two problems, one is that the .test_walk() can not know there is hole at tail side of range, because .test_walk() only call for vma not for hole. The other one is that mbind_range() checks for hole at head side of range but do not consider the MPOL_MF_DISCONTIG_OK flag as done in .test_walk(). This patch (of 2): Checking unmapped hole and updating the previous vma must be handled first, otherwise the unmapped hole could be calculated from a wrong previous vma. Several commits were relevant to this error: - commit 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") This commit was correct, the VM_PFNMAP check was after updating previous vma - commit 48684a65b4e3 ("mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)") This commit added VM_PFNMAP check before updating previous vma. Then, there were two VM_PFNMAP check did same thing twice. - commit acda0c334028 ("mm/mempolicy.c: get rid of duplicated check for vma(VM_PFNMAP) in queue_page s_range()") This commit tried to fix the duplicated VM_PFNMAP check, but it wrongly removed the one which was after updating vma. Link: http://lkml.kernel.org/r/1573218104-11021-2-git-send-email-lixinhai.lxh@gmail.com Fixes: acda0c334028 (mm/mempolicy.c: get rid of duplicated check for vma(VM_PFNMAP) in queue_pages_range()) Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: linux-man <linux-man@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-01 01:56:15 +00:00
/* range check first */
VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
mm/mempolicy.c: fix checking unmapped holes for mbind mbind() is required to report EFAULT if range, specified by addr and len, contains unmapped holes. In current implementation, below rules are applied for this checking: 1: Unmapped holes at any part of the specified range should be reported as EFAULT if mbind() for none MPOL_DEFAULT cases; 2: Unmapped holes at any part of the specified range should be ignored (do not reprot EFAULT) if mbind() for MPOL_DEFAULT case; 3: The whole range in an unmapped hole should be reported as EFAULT; Note that rule 2 does not fullfill the mbind() API definition, but since that behavior has existed for long days (the internal flag MPOL_MF_DISCONTIG_OK is for this purpose), this patch does not plan to change it. In current code, application observed inconsistent behavior on rule 1 and rule 2 respectively. That inconsistency is fixed as below details. Cases of rule 1: - Hole at head side of range. Current code reprot EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code report EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at tail side of range. Current code do not report EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] Cases of rule 2: - Hole at head side of range. Current code reports EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] - Hole at tail side of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] This patch has no changes to rule 3. The unmapped hole checking can also be handled by using .pte_hole(), instead of .test_walk(). But .pte_hole() is called for holes inside and outside vma, which causes more cost, so this patch keeps the original design with .test_walk(). Link: http://lkml.kernel.org/r/1573218104-11021-3-git-send-email-lixinhai.lxh@gmail.com Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: linux-man <linux-man@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-01 01:56:18 +00:00
if (!qp->first) {
qp->first = vma;
if (!(flags & MPOL_MF_DISCONTIG_OK) &&
(qp->start < vma->vm_start))
/* hole at head side of range */
mm/mempolicy.c: check range first in queue_pages_test_walk Patch series "mm: Fix checking unmapped holes for mbind", v4. This patchset fix checking unmapped holes for mbind(). First patch makes sure the vma been correctly tracked in .test_walk(), so each time when .test_walk() is called, the neighborhood of two vma is correct. Current problem is that the !vma_migratable() check could cause return immediately without update tracking to vma. Second patch fix the inconsistent report of EFAULT when mbind() is called for MPOL_DEFAULT and non MPOL_DEFAULT cases, so application do not need to have workaround code to handle this special behavior. Currently there are two problems, one is that the .test_walk() can not know there is hole at tail side of range, because .test_walk() only call for vma not for hole. The other one is that mbind_range() checks for hole at head side of range but do not consider the MPOL_MF_DISCONTIG_OK flag as done in .test_walk(). This patch (of 2): Checking unmapped hole and updating the previous vma must be handled first, otherwise the unmapped hole could be calculated from a wrong previous vma. Several commits were relevant to this error: - commit 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") This commit was correct, the VM_PFNMAP check was after updating previous vma - commit 48684a65b4e3 ("mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)") This commit added VM_PFNMAP check before updating previous vma. Then, there were two VM_PFNMAP check did same thing twice. - commit acda0c334028 ("mm/mempolicy.c: get rid of duplicated check for vma(VM_PFNMAP) in queue_page s_range()") This commit tried to fix the duplicated VM_PFNMAP check, but it wrongly removed the one which was after updating vma. Link: http://lkml.kernel.org/r/1573218104-11021-2-git-send-email-lixinhai.lxh@gmail.com Fixes: acda0c334028 (mm/mempolicy.c: get rid of duplicated check for vma(VM_PFNMAP) in queue_pages_range()) Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: linux-man <linux-man@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-01 01:56:15 +00:00
return -EFAULT;
}
next = find_vma(vma->vm_mm, vma->vm_end);
mm/mempolicy.c: fix checking unmapped holes for mbind mbind() is required to report EFAULT if range, specified by addr and len, contains unmapped holes. In current implementation, below rules are applied for this checking: 1: Unmapped holes at any part of the specified range should be reported as EFAULT if mbind() for none MPOL_DEFAULT cases; 2: Unmapped holes at any part of the specified range should be ignored (do not reprot EFAULT) if mbind() for MPOL_DEFAULT case; 3: The whole range in an unmapped hole should be reported as EFAULT; Note that rule 2 does not fullfill the mbind() API definition, but since that behavior has existed for long days (the internal flag MPOL_MF_DISCONTIG_OK is for this purpose), this patch does not plan to change it. In current code, application observed inconsistent behavior on rule 1 and rule 2 respectively. That inconsistency is fixed as below details. Cases of rule 1: - Hole at head side of range. Current code reprot EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code report EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at tail side of range. Current code do not report EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] Cases of rule 2: - Hole at head side of range. Current code reports EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] - Hole at tail side of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] This patch has no changes to rule 3. The unmapped hole checking can also be handled by using .pte_hole(), instead of .test_walk(). But .pte_hole() is called for holes inside and outside vma, which causes more cost, so this patch keeps the original design with .test_walk(). Link: http://lkml.kernel.org/r/1573218104-11021-3-git-send-email-lixinhai.lxh@gmail.com Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: linux-man <linux-man@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-01 01:56:18 +00:00
if (!(flags & MPOL_MF_DISCONTIG_OK) &&
((vma->vm_end < qp->end) &&
(!next || vma->vm_end < next->vm_start)))
mm/mempolicy.c: fix checking unmapped holes for mbind mbind() is required to report EFAULT if range, specified by addr and len, contains unmapped holes. In current implementation, below rules are applied for this checking: 1: Unmapped holes at any part of the specified range should be reported as EFAULT if mbind() for none MPOL_DEFAULT cases; 2: Unmapped holes at any part of the specified range should be ignored (do not reprot EFAULT) if mbind() for MPOL_DEFAULT case; 3: The whole range in an unmapped hole should be reported as EFAULT; Note that rule 2 does not fullfill the mbind() API definition, but since that behavior has existed for long days (the internal flag MPOL_MF_DISCONTIG_OK is for this purpose), this patch does not plan to change it. In current code, application observed inconsistent behavior on rule 1 and rule 2 respectively. That inconsistency is fixed as below details. Cases of rule 1: - Hole at head side of range. Current code reprot EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code report EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at tail side of range. Current code do not report EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] Cases of rule 2: - Hole at head side of range. Current code reports EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] - Hole at tail side of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] This patch has no changes to rule 3. The unmapped hole checking can also be handled by using .pte_hole(), instead of .test_walk(). But .pte_hole() is called for holes inside and outside vma, which causes more cost, so this patch keeps the original design with .test_walk(). Link: http://lkml.kernel.org/r/1573218104-11021-3-git-send-email-lixinhai.lxh@gmail.com Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: linux-man <linux-man@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-01 01:56:18 +00:00
/* hole at middle or tail of range */
return -EFAULT;
mm/mempolicy.c: check range first in queue_pages_test_walk Patch series "mm: Fix checking unmapped holes for mbind", v4. This patchset fix checking unmapped holes for mbind(). First patch makes sure the vma been correctly tracked in .test_walk(), so each time when .test_walk() is called, the neighborhood of two vma is correct. Current problem is that the !vma_migratable() check could cause return immediately without update tracking to vma. Second patch fix the inconsistent report of EFAULT when mbind() is called for MPOL_DEFAULT and non MPOL_DEFAULT cases, so application do not need to have workaround code to handle this special behavior. Currently there are two problems, one is that the .test_walk() can not know there is hole at tail side of range, because .test_walk() only call for vma not for hole. The other one is that mbind_range() checks for hole at head side of range but do not consider the MPOL_MF_DISCONTIG_OK flag as done in .test_walk(). This patch (of 2): Checking unmapped hole and updating the previous vma must be handled first, otherwise the unmapped hole could be calculated from a wrong previous vma. Several commits were relevant to this error: - commit 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") This commit was correct, the VM_PFNMAP check was after updating previous vma - commit 48684a65b4e3 ("mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)") This commit added VM_PFNMAP check before updating previous vma. Then, there were two VM_PFNMAP check did same thing twice. - commit acda0c334028 ("mm/mempolicy.c: get rid of duplicated check for vma(VM_PFNMAP) in queue_page s_range()") This commit tried to fix the duplicated VM_PFNMAP check, but it wrongly removed the one which was after updating vma. Link: http://lkml.kernel.org/r/1573218104-11021-2-git-send-email-lixinhai.lxh@gmail.com Fixes: acda0c334028 (mm/mempolicy.c: get rid of duplicated check for vma(VM_PFNMAP) in queue_pages_range()) Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: linux-man <linux-man@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-01 01:56:15 +00:00
mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified When MPOL_MF_STRICT was specified and an existing page was already on a node that does not follow the policy, mbind() should return -EIO. But commit 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") broke the rule. And commit c8633798497c ("mm: mempolicy: mbind and migrate_pages support thp migration") didn't return the correct value for THP mbind() too. If MPOL_MF_STRICT is set, ignore vma_migratable() to make sure it reaches queue_pages_to_pte_range() or queue_pages_pmd() to check if an existing page was already on a node that does not follow the policy. And, non-migratable vma may be used, return -EIO too if MPOL_MF_MOVE or MPOL_MF_MOVE_ALL was specified. Tested with https://github.com/metan-ucw/ltp/blob/master/testcases/kernel/syscalls/mbind/mbind02.c [akpm@linux-foundation.org: tweak code comment] Link: http://lkml.kernel.org/r/1553020556-38583-1-git-send-email-yang.shi@linux.alibaba.com Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Signed-off-by: Oscar Salvador <osalvador@suse.de> Reported-by: Cyril Hrubis <chrubis@suse.cz> Suggested-by: Kirill A. Shutemov <kirill@shutemov.name> Acked-by: Rafael Aquini <aquini@redhat.com> Reviewed-by: Oscar Salvador <osalvador@suse.de> Acked-by: David Rientjes <rientjes@google.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-03-29 03:43:55 +00:00
/*
* Need check MPOL_MF_STRICT to return -EIO if possible
* regardless of vma_migratable
*/
if (!vma_migratable(vma) &&
!(flags & MPOL_MF_STRICT))
return 1;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
/*
* Check page nodes, and queue pages to move, in the current vma.
* But if no moving, and no strict checking, the scan can be skipped.
*/
if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
return 0;
return 1;
}
static const struct mm_walk_ops queue_pages_walk_ops = {
.hugetlb_entry = queue_folios_hugetlb,
.pmd_entry = queue_folios_pte_range,
.test_walk = queue_pages_test_walk,
mm: enable page walking API to lock vmas during the walk walk_page_range() and friends often operate under write-locked mmap_lock. With introduction of vma locks, the vmas have to be locked as well during such walks to prevent concurrent page faults in these areas. Add an additional member to mm_walk_ops to indicate locking requirements for the walk. The change ensures that page walks which prevent concurrent page faults by write-locking mmap_lock, operate correctly after introduction of per-vma locks. With per-vma locks page faults can be handled under vma lock without taking mmap_lock at all, so write locking mmap_lock would not stop them. The change ensures vmas are properly locked during such walks. A sample issue this solves is do_mbind() performing queue_pages_range() to queue pages for migration. Without this change a concurrent page can be faulted into the area and be left out of migration. Link: https://lkml.kernel.org/r/20230804152724.3090321-2-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org> Suggested-by: Jann Horn <jannh@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Hugh Dickins <hughd@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Laurent Dufour <ldufour@linux.ibm.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Michel Lespinasse <michel@lespinasse.org> Cc: Peter Xu <peterx@redhat.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-04 15:27:19 +00:00
.walk_lock = PGWALK_RDLOCK,
};
static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
.hugetlb_entry = queue_folios_hugetlb,
.pmd_entry = queue_folios_pte_range,
.test_walk = queue_pages_test_walk,
.walk_lock = PGWALK_WRLOCK,
};
/*
* Walk through page tables and collect pages to be migrated.
*
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* If pages found in a given range are not on the required set of @nodes,
* and migration is allowed, they are isolated and queued to @pagelist.
mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified When both MPOL_MF_MOVE* and MPOL_MF_STRICT was specified, mbind() should try best to migrate misplaced pages, if some of the pages could not be migrated, then return -EIO. There are three different sub-cases: 1. vma is not migratable 2. vma is migratable, but there are unmovable pages 3. vma is migratable, pages are movable, but migrate_pages() fails If #1 happens, kernel would just abort immediately, then return -EIO, after a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified"). If #3 happens, kernel would set policy and migrate pages with best-effort, but won't rollback the migrated pages and reset the policy back. Before that commit, they behaves in the same way. It'd better to keep their behavior consistent. But, rolling back the migrated pages and resetting the policy back sounds not feasible, so just make #1 behave as same as #3. Userspace will know that not everything was successfully migrated (via -EIO), and can take whatever steps it deems necessary - attempt rollback, determine which exact page(s) are violating the policy, etc. Make queue_pages_range() return 1 to indicate there are unmovable pages or vma is not migratable. The #2 is not handled correctly in the current kernel, the following patch will fix it. [yang.shi@linux.alibaba.com: fix review comments from Vlastimil] Link: http://lkml.kernel.org/r/1563556862-54056-2-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1561162809-59140-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-13 22:37:15 +00:00
*
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* queue_pages_range() may return:
* 0 - all pages already on the right node, or successfully queued for moving
* (or neither strict checking nor moving requested: only range checking).
* >0 - this number of misplaced folios could not be queued for moving
* (a hugetlbfs page or a transparent huge page being counted as 1).
* -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
* -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
*/
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
nodemask_t *nodes, unsigned long flags,
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
struct list_head *pagelist)
{
mm/mempolicy.c: fix checking unmapped holes for mbind mbind() is required to report EFAULT if range, specified by addr and len, contains unmapped holes. In current implementation, below rules are applied for this checking: 1: Unmapped holes at any part of the specified range should be reported as EFAULT if mbind() for none MPOL_DEFAULT cases; 2: Unmapped holes at any part of the specified range should be ignored (do not reprot EFAULT) if mbind() for MPOL_DEFAULT case; 3: The whole range in an unmapped hole should be reported as EFAULT; Note that rule 2 does not fullfill the mbind() API definition, but since that behavior has existed for long days (the internal flag MPOL_MF_DISCONTIG_OK is for this purpose), this patch does not plan to change it. In current code, application observed inconsistent behavior on rule 1 and rule 2 respectively. That inconsistency is fixed as below details. Cases of rule 1: - Hole at head side of range. Current code reprot EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code report EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at tail side of range. Current code do not report EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] Cases of rule 2: - Hole at head side of range. Current code reports EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] - Hole at tail side of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] This patch has no changes to rule 3. The unmapped hole checking can also be handled by using .pte_hole(), instead of .test_walk(). But .pte_hole() is called for holes inside and outside vma, which causes more cost, so this patch keeps the original design with .test_walk(). Link: http://lkml.kernel.org/r/1573218104-11021-3-git-send-email-lixinhai.lxh@gmail.com Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: linux-man <linux-man@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-01 01:56:18 +00:00
int err;
struct queue_pages qp = {
.pagelist = pagelist,
.flags = flags,
.nmask = nodes,
mm/mempolicy.c: fix checking unmapped holes for mbind mbind() is required to report EFAULT if range, specified by addr and len, contains unmapped holes. In current implementation, below rules are applied for this checking: 1: Unmapped holes at any part of the specified range should be reported as EFAULT if mbind() for none MPOL_DEFAULT cases; 2: Unmapped holes at any part of the specified range should be ignored (do not reprot EFAULT) if mbind() for MPOL_DEFAULT case; 3: The whole range in an unmapped hole should be reported as EFAULT; Note that rule 2 does not fullfill the mbind() API definition, but since that behavior has existed for long days (the internal flag MPOL_MF_DISCONTIG_OK is for this purpose), this patch does not plan to change it. In current code, application observed inconsistent behavior on rule 1 and rule 2 respectively. That inconsistency is fixed as below details. Cases of rule 1: - Hole at head side of range. Current code reprot EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code report EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at tail side of range. Current code do not report EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] Cases of rule 2: - Hole at head side of range. Current code reports EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] - Hole at tail side of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] This patch has no changes to rule 3. The unmapped hole checking can also be handled by using .pte_hole(), instead of .test_walk(). But .pte_hole() is called for holes inside and outside vma, which causes more cost, so this patch keeps the original design with .test_walk(). Link: http://lkml.kernel.org/r/1573218104-11021-3-git-send-email-lixinhai.lxh@gmail.com Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: linux-man <linux-man@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-01 01:56:18 +00:00
.start = start,
.end = end,
.first = NULL,
};
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
mm: enable page walking API to lock vmas during the walk walk_page_range() and friends often operate under write-locked mmap_lock. With introduction of vma locks, the vmas have to be locked as well during such walks to prevent concurrent page faults in these areas. Add an additional member to mm_walk_ops to indicate locking requirements for the walk. The change ensures that page walks which prevent concurrent page faults by write-locking mmap_lock, operate correctly after introduction of per-vma locks. With per-vma locks page faults can be handled under vma lock without taking mmap_lock at all, so write locking mmap_lock would not stop them. The change ensures vmas are properly locked during such walks. A sample issue this solves is do_mbind() performing queue_pages_range() to queue pages for migration. Without this change a concurrent page can be faulted into the area and be left out of migration. Link: https://lkml.kernel.org/r/20230804152724.3090321-2-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org> Suggested-by: Jann Horn <jannh@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Hugh Dickins <hughd@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Laurent Dufour <ldufour@linux.ibm.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Michel Lespinasse <michel@lespinasse.org> Cc: Peter Xu <peterx@redhat.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-04 15:27:19 +00:00
&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
mm: enable page walking API to lock vmas during the walk walk_page_range() and friends often operate under write-locked mmap_lock. With introduction of vma locks, the vmas have to be locked as well during such walks to prevent concurrent page faults in these areas. Add an additional member to mm_walk_ops to indicate locking requirements for the walk. The change ensures that page walks which prevent concurrent page faults by write-locking mmap_lock, operate correctly after introduction of per-vma locks. With per-vma locks page faults can be handled under vma lock without taking mmap_lock at all, so write locking mmap_lock would not stop them. The change ensures vmas are properly locked during such walks. A sample issue this solves is do_mbind() performing queue_pages_range() to queue pages for migration. Without this change a concurrent page can be faulted into the area and be left out of migration. Link: https://lkml.kernel.org/r/20230804152724.3090321-2-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org> Suggested-by: Jann Horn <jannh@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Hugh Dickins <hughd@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Laurent Dufour <ldufour@linux.ibm.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Michel Lespinasse <michel@lespinasse.org> Cc: Peter Xu <peterx@redhat.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-04 15:27:19 +00:00
err = walk_page_range(mm, start, end, ops, &qp);
mm/mempolicy.c: fix checking unmapped holes for mbind mbind() is required to report EFAULT if range, specified by addr and len, contains unmapped holes. In current implementation, below rules are applied for this checking: 1: Unmapped holes at any part of the specified range should be reported as EFAULT if mbind() for none MPOL_DEFAULT cases; 2: Unmapped holes at any part of the specified range should be ignored (do not reprot EFAULT) if mbind() for MPOL_DEFAULT case; 3: The whole range in an unmapped hole should be reported as EFAULT; Note that rule 2 does not fullfill the mbind() API definition, but since that behavior has existed for long days (the internal flag MPOL_MF_DISCONTIG_OK is for this purpose), this patch does not plan to change it. In current code, application observed inconsistent behavior on rule 1 and rule 2 respectively. That inconsistency is fixed as below details. Cases of rule 1: - Hole at head side of range. Current code reprot EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code report EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at tail side of range. Current code do not report EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] Cases of rule 2: - Hole at head side of range. Current code reports EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] - Hole at tail side of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] This patch has no changes to rule 3. The unmapped hole checking can also be handled by using .pte_hole(), instead of .test_walk(). But .pte_hole() is called for holes inside and outside vma, which causes more cost, so this patch keeps the original design with .test_walk(). Link: http://lkml.kernel.org/r/1573218104-11021-3-git-send-email-lixinhai.lxh@gmail.com Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Hugh Dickins <hughd@google.com> Cc: linux-man <linux-man@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-12-01 01:56:18 +00:00
if (!qp.first)
/* whole range in hole */
err = -EFAULT;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
return err ? : qp.nr_failed;
}
mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones <davej@redhat.com>, Cc: Christoph Lameter <cl@linux.com>, Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Josh Boyer <jwboyer@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 23:29:16 +00:00
/*
* Apply policy to a single VMA
* This must be called with the mmap_lock held for writing.
mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones <davej@redhat.com>, Cc: Christoph Lameter <cl@linux.com>, Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Josh Boyer <jwboyer@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 23:29:16 +00:00
*/
static int vma_replace_policy(struct vm_area_struct *vma,
2023-10-03 09:20:14 +00:00
struct mempolicy *pol)
{
mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones <davej@redhat.com>, Cc: Christoph Lameter <cl@linux.com>, Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Josh Boyer <jwboyer@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 23:29:16 +00:00
int err;
struct mempolicy *old;
struct mempolicy *new;
vma_assert_write_locked(vma);
mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones <davej@redhat.com>, Cc: Christoph Lameter <cl@linux.com>, Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Josh Boyer <jwboyer@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 23:29:16 +00:00
new = mpol_dup(pol);
if (IS_ERR(new))
return PTR_ERR(new);
if (vma->vm_ops && vma->vm_ops->set_policy) {
err = vma->vm_ops->set_policy(vma, new);
mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones <davej@redhat.com>, Cc: Christoph Lameter <cl@linux.com>, Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Josh Boyer <jwboyer@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 23:29:16 +00:00
if (err)
goto err_out;
}
mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones <davej@redhat.com>, Cc: Christoph Lameter <cl@linux.com>, Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Josh Boyer <jwboyer@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 23:29:16 +00:00
old = vma->vm_policy;
vma->vm_policy = new; /* protected by mmap_lock */
mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones <davej@redhat.com>, Cc: Christoph Lameter <cl@linux.com>, Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Josh Boyer <jwboyer@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 23:29:16 +00:00
mpol_put(old);
return 0;
err_out:
mpol_put(new);
return err;
}
mm/mempolicy: fix use-after-free of VMA iterator set_mempolicy_home_node() iterates over a list of VMAs and calls mbind_range() on each VMA, which also iterates over the singular list of the VMA passed in and potentially splits the VMA. Since the VMA iterator is not passed through, set_mempolicy_home_node() may now point to a stale node in the VMA tree. This can result in a UAF as reported by syzbot. Avoid the stale maple tree node by passing the VMA iterator through to the underlying call to split_vma(). mbind_range() is also overly complicated, since there are two calling functions and one already handles iterating over the VMAs. Simplify mbind_range() to only handle merging and splitting of the VMAs. Align the new loop in do_mbind() and existing loop in set_mempolicy_home_node() to use the reduced mbind_range() function. This allows for a single location of the range calculation and avoids constantly looking up the previous VMA (since this is a loop over the VMAs). Link: https://lore.kernel.org/linux-mm/000000000000c93feb05f87e24ad@google.com/ Fixes: 66850be55e8e ("mm/mempolicy: use vma iterator & maple state instead of vma linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20230410152205.2294819-1-Liam.Howlett@oracle.com Tested-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-10 15:22:05 +00:00
/* Split or merge the VMA (if required) and apply the new policy */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
mm/mempolicy: fix use-after-free of VMA iterator set_mempolicy_home_node() iterates over a list of VMAs and calls mbind_range() on each VMA, which also iterates over the singular list of the VMA passed in and potentially splits the VMA. Since the VMA iterator is not passed through, set_mempolicy_home_node() may now point to a stale node in the VMA tree. This can result in a UAF as reported by syzbot. Avoid the stale maple tree node by passing the VMA iterator through to the underlying call to split_vma(). mbind_range() is also overly complicated, since there are two calling functions and one already handles iterating over the VMAs. Simplify mbind_range() to only handle merging and splitting of the VMAs. Align the new loop in do_mbind() and existing loop in set_mempolicy_home_node() to use the reduced mbind_range() function. This allows for a single location of the range calculation and avoids constantly looking up the previous VMA (since this is a loop over the VMAs). Link: https://lore.kernel.org/linux-mm/000000000000c93feb05f87e24ad@google.com/ Fixes: 66850be55e8e ("mm/mempolicy: use vma iterator & maple state instead of vma linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20230410152205.2294819-1-Liam.Howlett@oracle.com Tested-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-10 15:22:05 +00:00
unsigned long vmstart, vmend;
mm: fix mbind vma merge problem Strangely, current mbind() doesn't merge vma with neighbor vma although it's possible. Unfortunately, many vma can reduce performance... This patch fixes it. reproduced program ---------------------------------------------------------------- #include <numaif.h> #include <numa.h> #include <sys/mman.h> #include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <string.h> static unsigned long pagesize; int main(int argc, char** argv) { void* addr; int ch; int node; struct bitmask *nmask = numa_allocate_nodemask(); int err; int node_set = 0; char buf[128]; while ((ch = getopt(argc, argv, "n:")) != -1){ switch (ch){ case 'n': node = strtol(optarg, NULL, 0); numa_bitmask_setbit(nmask, node); node_set = 1; break; default: ; } } argc -= optind; argv += optind; if (!node_set) numa_bitmask_setbit(nmask, 0); pagesize = getpagesize(); addr = mmap(NULL, pagesize*3, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, 0, 0); if (addr == MAP_FAILED) perror("mmap "), exit(1); fprintf(stderr, "pid = %d \n" "addr = %p\n", getpid(), addr); /* make page populate */ memset(addr, 0, pagesize*3); /* first mbind */ err = mbind(addr+pagesize, pagesize, MPOL_BIND, nmask->maskp, nmask->size, MPOL_MF_MOVE_ALL); if (err) error("mbind1 "); /* second mbind */ err = mbind(addr, pagesize*3, MPOL_DEFAULT, NULL, 0, 0); if (err) error("mbind2 "); sprintf(buf, "cat /proc/%d/maps", getpid()); system(buf); return 0; } ---------------------------------------------------------------- result without this patch addr = 0x7fe26ef09000 [snip] 7fe26ef09000-7fe26ef0a000 rw-p 00000000 00:00 0 7fe26ef0a000-7fe26ef0b000 rw-p 00000000 00:00 0 7fe26ef0b000-7fe26ef0c000 rw-p 00000000 00:00 0 7fe26ef0c000-7fe26ef0d000 rw-p 00000000 00:00 0 => 0x7fe26ef09000-0x7fe26ef0c000 have three vmas. result with this patch addr = 0x7fc9ebc76000 [snip] 7fc9ebc76000-7fc9ebc7a000 rw-p 00000000 00:00 0 7fffbe690000-7fffbe6a5000 rw-p 00000000 00:00 0 [stack] => 0x7fc9ebc76000-0x7fc9ebc7a000 have only one vma. [minchan.kim@gmail.com: fix file offset passed to vma_merge()] Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Reviewed-by: Christoph Lameter <cl@linux-foundation.org> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 21:41:57 +00:00
mm/mempolicy: fix use-after-free of VMA iterator set_mempolicy_home_node() iterates over a list of VMAs and calls mbind_range() on each VMA, which also iterates over the singular list of the VMA passed in and potentially splits the VMA. Since the VMA iterator is not passed through, set_mempolicy_home_node() may now point to a stale node in the VMA tree. This can result in a UAF as reported by syzbot. Avoid the stale maple tree node by passing the VMA iterator through to the underlying call to split_vma(). mbind_range() is also overly complicated, since there are two calling functions and one already handles iterating over the VMAs. Simplify mbind_range() to only handle merging and splitting of the VMAs. Align the new loop in do_mbind() and existing loop in set_mempolicy_home_node() to use the reduced mbind_range() function. This allows for a single location of the range calculation and avoids constantly looking up the previous VMA (since this is a loop over the VMAs). Link: https://lore.kernel.org/linux-mm/000000000000c93feb05f87e24ad@google.com/ Fixes: 66850be55e8e ("mm/mempolicy: use vma iterator & maple state instead of vma linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20230410152205.2294819-1-Liam.Howlett@oracle.com Tested-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-10 15:22:05 +00:00
vmend = min(end, vma->vm_end);
if (start > vma->vm_start) {
*prev = vma;
vmstart = start;
} else {
vmstart = vma->vm_start;
}
2023-10-03 09:20:14 +00:00
if (mpol_equal(vma->vm_policy, new_pol)) {
mm/mempolicy: correctly update prev when policy is equal on mbind The refactoring in commit f4e9e0e69468 ("mm/mempolicy: fix use-after-free of VMA iterator") introduces a subtle bug which arises when attempting to apply a new NUMA policy across a range of VMAs in mbind_range(). The refactoring passes a **prev pointer to keep track of the previous VMA in order to reduce duplication, and in all but one case it keeps this correctly updated. The bug arises when a VMA within the specified range has an equivalent policy as determined by mpol_equal() - which unlike other cases, does not update prev. This can result in a situation where, later in the iteration, a VMA is found whose policy does need to change. At this point, vma_merge() is invoked with prev pointing to a VMA which is before the previous VMA. Since vma_merge() discovers the curr VMA by looking for the one immediately after prev, it will now be in a situation where this VMA is incorrect and the merge will not proceed correctly. This is checked in the VM_WARN_ON() invariant case with end > curr->vm_end, which, if a merge is possible, results in a warning (if CONFIG_DEBUG_VM is specified). I note that vma_merge() performs these invariant checks only after merge_prev/merge_next are checked, which is debatable as it hides this issue if no merge is possible even though a buggy situation has arisen. The solution is simply to update the prev pointer even when policies are equal. This caused a bug to arise in the 6.2.y stable tree, and this patch resolves this bug. Link: https://lkml.kernel.org/r/83f1d612acb519d777bebf7f3359317c4e7f4265.1682866629.git.lstoakes@gmail.com Fixes: f4e9e0e69468 ("mm/mempolicy: fix use-after-free of VMA iterator") Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com> Reported-by: kernel test robot <oliver.sang@intel.com> Link: https://lore.kernel.org/oe-lkp/202304292203.44ddeff6-oliver.sang@intel.com Cc: Liam R. Howlett <Liam.Howlett@oracle.com> Cc: Mel Gorman <mgorman@suse.de> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-30 15:07:07 +00:00
*prev = vma;
return 0;
mm/mempolicy: correctly update prev when policy is equal on mbind The refactoring in commit f4e9e0e69468 ("mm/mempolicy: fix use-after-free of VMA iterator") introduces a subtle bug which arises when attempting to apply a new NUMA policy across a range of VMAs in mbind_range(). The refactoring passes a **prev pointer to keep track of the previous VMA in order to reduce duplication, and in all but one case it keeps this correctly updated. The bug arises when a VMA within the specified range has an equivalent policy as determined by mpol_equal() - which unlike other cases, does not update prev. This can result in a situation where, later in the iteration, a VMA is found whose policy does need to change. At this point, vma_merge() is invoked with prev pointing to a VMA which is before the previous VMA. Since vma_merge() discovers the curr VMA by looking for the one immediately after prev, it will now be in a situation where this VMA is incorrect and the merge will not proceed correctly. This is checked in the VM_WARN_ON() invariant case with end > curr->vm_end, which, if a merge is possible, results in a warning (if CONFIG_DEBUG_VM is specified). I note that vma_merge() performs these invariant checks only after merge_prev/merge_next are checked, which is debatable as it hides this issue if no merge is possible even though a buggy situation has arisen. The solution is simply to update the prev pointer even when policies are equal. This caused a bug to arise in the 6.2.y stable tree, and this patch resolves this bug. Link: https://lkml.kernel.org/r/83f1d612acb519d777bebf7f3359317c4e7f4265.1682866629.git.lstoakes@gmail.com Fixes: f4e9e0e69468 ("mm/mempolicy: fix use-after-free of VMA iterator") Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com> Reported-by: kernel test robot <oliver.sang@intel.com> Link: https://lore.kernel.org/oe-lkp/202304292203.44ddeff6-oliver.sang@intel.com Cc: Liam R. Howlett <Liam.Howlett@oracle.com> Cc: Mel Gorman <mgorman@suse.de> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-30 15:07:07 +00:00
}
mm: abstract the vma_merge()/split_vma() pattern for mprotect() et al. mprotect() and other functions which change VMA parameters over a range each employ a pattern of:- 1. Attempt to merge the range with adjacent VMAs. 2. If this fails, and the range spans a subset of the VMA, split it accordingly. This is open-coded and duplicated in each case. Also in each case most of the parameters passed to vma_merge() remain the same. Create a new function, vma_modify(), which abstracts this operation, accepting only those parameters which can be changed. To avoid the mess of invoking each function call with unnecessary parameters, create inline wrapper functions for each of the modify operations, parameterised only by what is required to perform the action. We can also significantly simplify the logic - by returning the VMA if we split (or merged VMA if we do not) we no longer need specific handling for merge/split cases in any of the call sites. Note that the userfaultfd_release() case works even though it does not split VMAs - since start is set to vma->vm_start and end is set to vma->vm_end, the split logic does not trigger. In addition, since we calculate pgoff to be equal to vma->vm_pgoff + (start - vma->vm_start) >> PAGE_SHIFT, and start - vma->vm_start will be 0 in this instance, this invocation will remain unchanged. We eliminate a VM_WARN_ON() in mprotect_fixup() as this simply asserts that vma_merge() correctly ensures that flags remain the same, something that is already checked in is_mergeable_vma() and elsewhere, and in any case is not specific to mprotect(). Link: https://lkml.kernel.org/r/0dfa9368f37199a423674bf0ee312e8ea0619044.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Christian Brauner <brauner@kernel.org> Cc: Liam R. Howlett <Liam.Howlett@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-11 17:04:28 +00:00
vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
if (IS_ERR(vma))
return PTR_ERR(vma);
mm/mempolicy: fix use-after-free of VMA iterator set_mempolicy_home_node() iterates over a list of VMAs and calls mbind_range() on each VMA, which also iterates over the singular list of the VMA passed in and potentially splits the VMA. Since the VMA iterator is not passed through, set_mempolicy_home_node() may now point to a stale node in the VMA tree. This can result in a UAF as reported by syzbot. Avoid the stale maple tree node by passing the VMA iterator through to the underlying call to split_vma(). mbind_range() is also overly complicated, since there are two calling functions and one already handles iterating over the VMAs. Simplify mbind_range() to only handle merging and splitting of the VMAs. Align the new loop in do_mbind() and existing loop in set_mempolicy_home_node() to use the reduced mbind_range() function. This allows for a single location of the range calculation and avoids constantly looking up the previous VMA (since this is a loop over the VMAs). Link: https://lore.kernel.org/linux-mm/000000000000c93feb05f87e24ad@google.com/ Fixes: 66850be55e8e ("mm/mempolicy: use vma iterator & maple state instead of vma linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20230410152205.2294819-1-Liam.Howlett@oracle.com Tested-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-10 15:22:05 +00:00
*prev = vma;
return vma_replace_policy(vma, new_pol);
}
/* Set the process memory policy */
mempolicy: support optional mode flags With the evolution of mempolicies, it is necessary to support mempolicy mode flags that specify how the policy shall behave in certain circumstances. The most immediate need for mode flag support is to suppress remapping the nodemask of a policy at the time of rebind. Both the mempolicy mode and flags are passed by the user in the 'int policy' formal of either the set_mempolicy() or mbind() syscall. A new constant, MPOL_MODE_FLAGS, represents the union of legal optional flags that may be passed as part of this int. Mempolicies that include illegal flags as part of their policy are rejected as invalid. An additional member to struct mempolicy is added to support the mode flags: struct mempolicy { ... unsigned short policy; unsigned short flags; } The splitting of the 'int' actual passed by the user is done in sys_set_mempolicy() and sys_mbind() for their respective syscalls. This is done by intersecting the actual with MPOL_MODE_FLAGS, rejecting the syscall of there are additional flags, and storing it in the new 'flags' member of struct mempolicy. The intersection of the actual with ~MPOL_MODE_FLAGS is stored in the 'policy' member of the struct and all current users of pol->policy remain unchanged. The union of the policy mode and optional mode flags is passed back to the user in get_mempolicy(). This combination of mode and flags within the same actual does not break userspace code that relies on get_mempolicy(&policy, ...) and either switch (policy) { case MPOL_BIND: ... case MPOL_INTERLEAVE: ... }; statements or if (policy == MPOL_INTERLEAVE) { ... } statements. Such applications would need to use optional mode flags when calling set_mempolicy() or mbind() for these previously implemented statements to stop working. If an application does start using optional mode flags, it will need to mask the optional flags off the policy in switch and conditional statements that only test mode. An additional member is also added to struct shmem_sb_info to store the optional mode flags. [hugh@veritas.com: shmem mpol: fix build warning] Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:25 +00:00
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
struct mempolicy *new, *old;
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
NODEMASK_SCRATCH(scratch);
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
int ret;
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
if (!scratch)
return -ENOMEM;
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
new = mpol_new(mode, flags, nodes);
if (IS_ERR(new)) {
ret = PTR_ERR(new);
goto out;
}
mm/mempolicy: fix lock contention on mems_allowed The mems_allowed field can be modified by other tasks, so it isn't safe to access it with alloc_lock unlocked even in the current process context. Say there are two tasks: A from cpusetA is performing set_mempolicy(2), and B is changing cpusetA's cpuset.mems: A (set_mempolicy) B (echo xx > cpuset.mems) ------------------------------------------------------- pol = mpol_new(); update_tasks_nodemask(cpusetA) { foreach t in cpusetA { cpuset_change_task_nodemask(t) { mpol_set_nodemask(pol) { task_lock(t); // t could be A new = f(A->mems_allowed); update t->mems_allowed; pol.create(pol, new); task_unlock(t); } } } } task_lock(A); A->mempolicy = pol; task_unlock(A); In this case A's pol->nodes is computed by old mems_allowed, and could be inconsistent with A's new mems_allowed. While it is different when replacing vmas' policy: the pol->nodes is gone wild only when current_cpuset_is_being_rebound(): A (mbind) B (echo xx > cpuset.mems) ------------------------------------------------------- pol = mpol_new(); mmap_write_lock(A->mm); cpuset_being_rebound = cpusetA; update_tasks_nodemask(cpusetA) { foreach t in cpusetA { cpuset_change_task_nodemask(t) { mpol_set_nodemask(pol) { task_lock(t); // t could be A mask = f(A->mems_allowed); update t->mems_allowed; pol.create(pol, mask); task_unlock(t); } } foreach v in A->mm { if (cpuset_being_rebound == cpusetA) pol.rebind(pol, cpuset.mems); v->vma_policy = pol; } mmap_write_unlock(A->mm); mmap_write_lock(t->mm); mpol_rebind_mm(t->mm); mmap_write_unlock(t->mm); } } cpuset_being_rebound = NULL; In this case, the cpuset.mems, which has already done updating, is finally used for calculating pol->nodes, rather than A->mems_allowed. So it is OK to call mpol_set_nodemask() with alloc_lock unlocked when doing mbind(2). Link: https://lkml.kernel.org/r/20220811124157.74888-1-wuyun.abel@bytedance.com Fixes: 78b132e9bae9 ("mm/mempolicy: remove or narrow the lock on current") Signed-off-by: Abel Wu <wuyun.abel@bytedance.com> Acked-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Wei Yang <richard.weiyang@gmail.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-08-11 12:41:57 +00:00
task_lock(current);
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
ret = mpol_set_nodemask(new, nodes, scratch);
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
if (ret) {
mm/mempolicy: fix lock contention on mems_allowed The mems_allowed field can be modified by other tasks, so it isn't safe to access it with alloc_lock unlocked even in the current process context. Say there are two tasks: A from cpusetA is performing set_mempolicy(2), and B is changing cpusetA's cpuset.mems: A (set_mempolicy) B (echo xx > cpuset.mems) ------------------------------------------------------- pol = mpol_new(); update_tasks_nodemask(cpusetA) { foreach t in cpusetA { cpuset_change_task_nodemask(t) { mpol_set_nodemask(pol) { task_lock(t); // t could be A new = f(A->mems_allowed); update t->mems_allowed; pol.create(pol, new); task_unlock(t); } } } } task_lock(A); A->mempolicy = pol; task_unlock(A); In this case A's pol->nodes is computed by old mems_allowed, and could be inconsistent with A's new mems_allowed. While it is different when replacing vmas' policy: the pol->nodes is gone wild only when current_cpuset_is_being_rebound(): A (mbind) B (echo xx > cpuset.mems) ------------------------------------------------------- pol = mpol_new(); mmap_write_lock(A->mm); cpuset_being_rebound = cpusetA; update_tasks_nodemask(cpusetA) { foreach t in cpusetA { cpuset_change_task_nodemask(t) { mpol_set_nodemask(pol) { task_lock(t); // t could be A mask = f(A->mems_allowed); update t->mems_allowed; pol.create(pol, mask); task_unlock(t); } } foreach v in A->mm { if (cpuset_being_rebound == cpusetA) pol.rebind(pol, cpuset.mems); v->vma_policy = pol; } mmap_write_unlock(A->mm); mmap_write_lock(t->mm); mpol_rebind_mm(t->mm); mmap_write_unlock(t->mm); } } cpuset_being_rebound = NULL; In this case, the cpuset.mems, which has already done updating, is finally used for calculating pol->nodes, rather than A->mems_allowed. So it is OK to call mpol_set_nodemask() with alloc_lock unlocked when doing mbind(2). Link: https://lkml.kernel.org/r/20220811124157.74888-1-wuyun.abel@bytedance.com Fixes: 78b132e9bae9 ("mm/mempolicy: remove or narrow the lock on current") Signed-off-by: Abel Wu <wuyun.abel@bytedance.com> Acked-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Wei Yang <richard.weiyang@gmail.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-08-11 12:41:57 +00:00
task_unlock(current);
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
mpol_put(new);
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
goto out;
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
}
mm/mempolicy: fix lock contention on mems_allowed The mems_allowed field can be modified by other tasks, so it isn't safe to access it with alloc_lock unlocked even in the current process context. Say there are two tasks: A from cpusetA is performing set_mempolicy(2), and B is changing cpusetA's cpuset.mems: A (set_mempolicy) B (echo xx > cpuset.mems) ------------------------------------------------------- pol = mpol_new(); update_tasks_nodemask(cpusetA) { foreach t in cpusetA { cpuset_change_task_nodemask(t) { mpol_set_nodemask(pol) { task_lock(t); // t could be A new = f(A->mems_allowed); update t->mems_allowed; pol.create(pol, new); task_unlock(t); } } } } task_lock(A); A->mempolicy = pol; task_unlock(A); In this case A's pol->nodes is computed by old mems_allowed, and could be inconsistent with A's new mems_allowed. While it is different when replacing vmas' policy: the pol->nodes is gone wild only when current_cpuset_is_being_rebound(): A (mbind) B (echo xx > cpuset.mems) ------------------------------------------------------- pol = mpol_new(); mmap_write_lock(A->mm); cpuset_being_rebound = cpusetA; update_tasks_nodemask(cpusetA) { foreach t in cpusetA { cpuset_change_task_nodemask(t) { mpol_set_nodemask(pol) { task_lock(t); // t could be A mask = f(A->mems_allowed); update t->mems_allowed; pol.create(pol, mask); task_unlock(t); } } foreach v in A->mm { if (cpuset_being_rebound == cpusetA) pol.rebind(pol, cpuset.mems); v->vma_policy = pol; } mmap_write_unlock(A->mm); mmap_write_lock(t->mm); mpol_rebind_mm(t->mm); mmap_write_unlock(t->mm); } } cpuset_being_rebound = NULL; In this case, the cpuset.mems, which has already done updating, is finally used for calculating pol->nodes, rather than A->mems_allowed. So it is OK to call mpol_set_nodemask() with alloc_lock unlocked when doing mbind(2). Link: https://lkml.kernel.org/r/20220811124157.74888-1-wuyun.abel@bytedance.com Fixes: 78b132e9bae9 ("mm/mempolicy: remove or narrow the lock on current") Signed-off-by: Abel Wu <wuyun.abel@bytedance.com> Acked-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Wei Yang <richard.weiyang@gmail.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-08-11 12:41:57 +00:00
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
old = current->mempolicy;
current->mempolicy = new;
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
if (new && (new->mode == MPOL_INTERLEAVE ||
new->mode == MPOL_WEIGHTED_INTERLEAVE)) {
mm, mempolicy: stop adjusting current->il_next in mpol_rebind_nodemask() The task->il_next variable stores the next allocation node id for task's MPOL_INTERLEAVE policy. mpol_rebind_nodemask() updates interleave and bind mempolicies due to changing cpuset mems. Currently it also tries to make sure that current->il_next is valid within the updated nodemask. This is bogus, because 1) we are updating potentially any task's mempolicy, not just current, and 2) we might be updating a per-vma mempolicy, not task one. The interleave_nodes() function that uses il_next can cope fine with the value not being within the currently allowed nodes, so this hasn't manifested as an actual issue. We can remove the need for updating il_next completely by changing it to il_prev and store the node id of the previous interleave allocation instead of the next id. Then interleave_nodes() can calculate the next id using the current nodemask and also store it as il_prev, except when querying the next node via do_get_mempolicy(). Link: http://lkml.kernel.org/r/20170517081140.30654-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Reviewed-by: Christoph Lameter <cl@linux.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:39:59 +00:00
current->il_prev = MAX_NUMNODES-1;
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
current->il_weight = 0;
}
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
task_unlock(current);
mpol_put(old);
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
ret = 0;
out:
NODEMASK_SCRATCH_FREE(scratch);
return ret;
}
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
/*
* Return nodemask for policy for get_mempolicy() query
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
*
* Called with task's alloc_lock held
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
*/
2023-10-03 09:20:14 +00:00
static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
nodes_clear(*nodes);
2023-10-03 09:20:14 +00:00
if (pol == &default_policy)
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
return;
2023-10-03 09:20:14 +00:00
switch (pol->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
case MPOL_PREFERRED_MANY:
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
case MPOL_WEIGHTED_INTERLEAVE:
2023-10-03 09:20:14 +00:00
*nodes = pol->nodes;
break;
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
case MPOL_LOCAL:
/* return empty node mask for local allocation */
break;
default:
BUG();
}
}
userfaultfd: allow get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) to trigger userfaults get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) called a get_user_pages that would not be waiting for userfaults before failing and it would hit on a SIGBUS instead. Using get_user_pages_locked/unlocked instead will allow get_mempolicy to allow userfaults to resolve the fault and fill the hole, before grabbing the node id of the page. If the user calls get_mempolicy() with MPOL_F_ADDR | MPOL_F_NODE for an address inside an area managed by uffd and there is no page at that address, the page allocation from within get_mempolicy() will fail because get_user_pages() does not allow for page fault retry required for uffd; the user will get SIGBUS. With this patch, the page fault will be resolved by the uffd and the get_mempolicy() will continue normally. Background: Via code review, previously the syscall would have returned -EFAULT (vm_fault_to_errno), now it will block and wait for an userfault (if it's waken before the fault is resolved it'll still -EFAULT). This way get_mempolicy will give a chance to an "unaware" app to be compliant with userfaults. The reason this visible change is that becoming "userfault compliant" cannot regress anything: all other syscalls including read(2)/write(2) had to become "userfault compliant" long time ago (that's one of the things userfaultfd can do that PROT_NONE and trapping segfaults can't). So this is just one more syscall that become "userfault compliant" like all other major ones already were. This has been happening on virtio-bridge dpdk process which just called get_mempolicy on the guest space post live migration, but before the memory had a chance to be migrated to destination. I didn't run an strace to be able to show the -EFAULT going away, but I've the confirmation of the below debug aid information (only visible with CONFIG_DEBUG_VM=y) going away with the patch: [20116.371461] FAULT_FLAG_ALLOW_RETRY missing 0 [20116.371464] CPU: 1 PID: 13381 Comm: vhost-events Not tainted 4.17.12-200.fc28.x86_64 #1 [20116.371465] Hardware name: LENOVO 20FAS2BN0A/20FAS2BN0A, BIOS N1CET54W (1.22 ) 02/10/2017 [20116.371466] Call Trace: [20116.371473] dump_stack+0x5c/0x80 [20116.371476] handle_userfault.cold.37+0x1b/0x22 [20116.371479] ? remove_wait_queue+0x20/0x60 [20116.371481] ? poll_freewait+0x45/0xa0 [20116.371483] ? do_sys_poll+0x31c/0x520 [20116.371485] ? radix_tree_lookup_slot+0x1e/0x50 [20116.371488] shmem_getpage_gfp+0xce7/0xe50 [20116.371491] ? page_add_file_rmap+0x1a/0x2c0 [20116.371493] shmem_fault+0x78/0x1e0 [20116.371495] ? filemap_map_pages+0x3a1/0x450 [20116.371498] __do_fault+0x1f/0xc0 [20116.371500] __handle_mm_fault+0xe2e/0x12f0 [20116.371502] handle_mm_fault+0xda/0x200 [20116.371504] __get_user_pages+0x238/0x790 [20116.371506] get_user_pages+0x3e/0x50 [20116.371510] kernel_get_mempolicy+0x40b/0x700 [20116.371512] ? vfs_write+0x170/0x1a0 [20116.371515] __x64_sys_get_mempolicy+0x21/0x30 [20116.371517] do_syscall_64+0x5b/0x160 [20116.371520] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The above harmless debug message (not a kernel crash, just a dump_stack()) is shown with CONFIG_DEBUG_VM=y to more quickly identify and improve kernel spots that may have to become "userfaultfd compliant" like this one (without having to run an strace and search for syscall misbehavior). Spots like the above are more closer to a kernel bug for the non-cooperative usages that Mike focuses on, than for for dpdk qemu-cooperative usages that reproduced it, but it's still nicer to get this fixed for dpdk too. The part of the patch that caused me to think is only the implementation issue of mpol_get, but it looks like it should work safe no matter the kind of mempolicy structure that is (the default static policy also starts at 1 so it'll go to 2 and back to 1 without crashing everything at 0). [rppt@linux.vnet.ibm.com: changelog addition] http://lkml.kernel.org/r/20180904073718.GA26916@rapoport-lnx Link: http://lkml.kernel.org/r/20180831214848.23676-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-26 22:05:16 +00:00
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
struct page *p = NULL;
int ret;
ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
if (ret > 0) {
ret = page_to_nid(p);
put_page(p);
}
return ret;
}
/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
unsigned long addr, unsigned long flags)
{
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
userfaultfd: allow get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) to trigger userfaults get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) called a get_user_pages that would not be waiting for userfaults before failing and it would hit on a SIGBUS instead. Using get_user_pages_locked/unlocked instead will allow get_mempolicy to allow userfaults to resolve the fault and fill the hole, before grabbing the node id of the page. If the user calls get_mempolicy() with MPOL_F_ADDR | MPOL_F_NODE for an address inside an area managed by uffd and there is no page at that address, the page allocation from within get_mempolicy() will fail because get_user_pages() does not allow for page fault retry required for uffd; the user will get SIGBUS. With this patch, the page fault will be resolved by the uffd and the get_mempolicy() will continue normally. Background: Via code review, previously the syscall would have returned -EFAULT (vm_fault_to_errno), now it will block and wait for an userfault (if it's waken before the fault is resolved it'll still -EFAULT). This way get_mempolicy will give a chance to an "unaware" app to be compliant with userfaults. The reason this visible change is that becoming "userfault compliant" cannot regress anything: all other syscalls including read(2)/write(2) had to become "userfault compliant" long time ago (that's one of the things userfaultfd can do that PROT_NONE and trapping segfaults can't). So this is just one more syscall that become "userfault compliant" like all other major ones already were. This has been happening on virtio-bridge dpdk process which just called get_mempolicy on the guest space post live migration, but before the memory had a chance to be migrated to destination. I didn't run an strace to be able to show the -EFAULT going away, but I've the confirmation of the below debug aid information (only visible with CONFIG_DEBUG_VM=y) going away with the patch: [20116.371461] FAULT_FLAG_ALLOW_RETRY missing 0 [20116.371464] CPU: 1 PID: 13381 Comm: vhost-events Not tainted 4.17.12-200.fc28.x86_64 #1 [20116.371465] Hardware name: LENOVO 20FAS2BN0A/20FAS2BN0A, BIOS N1CET54W (1.22 ) 02/10/2017 [20116.371466] Call Trace: [20116.371473] dump_stack+0x5c/0x80 [20116.371476] handle_userfault.cold.37+0x1b/0x22 [20116.371479] ? remove_wait_queue+0x20/0x60 [20116.371481] ? poll_freewait+0x45/0xa0 [20116.371483] ? do_sys_poll+0x31c/0x520 [20116.371485] ? radix_tree_lookup_slot+0x1e/0x50 [20116.371488] shmem_getpage_gfp+0xce7/0xe50 [20116.371491] ? page_add_file_rmap+0x1a/0x2c0 [20116.371493] shmem_fault+0x78/0x1e0 [20116.371495] ? filemap_map_pages+0x3a1/0x450 [20116.371498] __do_fault+0x1f/0xc0 [20116.371500] __handle_mm_fault+0xe2e/0x12f0 [20116.371502] handle_mm_fault+0xda/0x200 [20116.371504] __get_user_pages+0x238/0x790 [20116.371506] get_user_pages+0x3e/0x50 [20116.371510] kernel_get_mempolicy+0x40b/0x700 [20116.371512] ? vfs_write+0x170/0x1a0 [20116.371515] __x64_sys_get_mempolicy+0x21/0x30 [20116.371517] do_syscall_64+0x5b/0x160 [20116.371520] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The above harmless debug message (not a kernel crash, just a dump_stack()) is shown with CONFIG_DEBUG_VM=y to more quickly identify and improve kernel spots that may have to become "userfaultfd compliant" like this one (without having to run an strace and search for syscall misbehavior). Spots like the above are more closer to a kernel bug for the non-cooperative usages that Mike focuses on, than for for dpdk qemu-cooperative usages that reproduced it, but it's still nicer to get this fixed for dpdk too. The part of the patch that caused me to think is only the implementation issue of mpol_get, but it looks like it should work safe no matter the kind of mempolicy structure that is (the default static policy also starts at 1 so it'll go to 2 and back to 1 without crashing everything at 0). [rppt@linux.vnet.ibm.com: changelog addition] http://lkml.kernel.org/r/20180904073718.GA26916@rapoport-lnx Link: http://lkml.kernel.org/r/20180831214848.23676-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-26 22:05:16 +00:00
struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
if (flags & MPOL_F_MEMS_ALLOWED) {
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
task_lock(current);
*nmask = cpuset_current_mems_allowed;
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
task_unlock(current);
return 0;
}
if (flags & MPOL_F_ADDR) {
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
pgoff_t ilx; /* ignored here */
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
/*
* Do NOT fall back to task policy if the
* vma/shared policy at addr is NULL. We
* want to return MPOL_DEFAULT in this case.
*/
mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse <walken@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com> Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Liam Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ying Han <yinghan@google.com> Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-09 04:33:25 +00:00
mmap_read_lock(mm);
vma = vma_lookup(mm, addr);
if (!vma) {
mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse <walken@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com> Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Liam Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ying Han <yinghan@google.com> Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-09 04:33:25 +00:00
mmap_read_unlock(mm);
return -EFAULT;
}
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
pol = __get_vma_policy(vma, addr, &ilx);
} else if (addr)
return -EINVAL;
if (!pol)
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
pol = &default_policy; /* indicates default behavior */
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
userfaultfd: allow get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) to trigger userfaults get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) called a get_user_pages that would not be waiting for userfaults before failing and it would hit on a SIGBUS instead. Using get_user_pages_locked/unlocked instead will allow get_mempolicy to allow userfaults to resolve the fault and fill the hole, before grabbing the node id of the page. If the user calls get_mempolicy() with MPOL_F_ADDR | MPOL_F_NODE for an address inside an area managed by uffd and there is no page at that address, the page allocation from within get_mempolicy() will fail because get_user_pages() does not allow for page fault retry required for uffd; the user will get SIGBUS. With this patch, the page fault will be resolved by the uffd and the get_mempolicy() will continue normally. Background: Via code review, previously the syscall would have returned -EFAULT (vm_fault_to_errno), now it will block and wait for an userfault (if it's waken before the fault is resolved it'll still -EFAULT). This way get_mempolicy will give a chance to an "unaware" app to be compliant with userfaults. The reason this visible change is that becoming "userfault compliant" cannot regress anything: all other syscalls including read(2)/write(2) had to become "userfault compliant" long time ago (that's one of the things userfaultfd can do that PROT_NONE and trapping segfaults can't). So this is just one more syscall that become "userfault compliant" like all other major ones already were. This has been happening on virtio-bridge dpdk process which just called get_mempolicy on the guest space post live migration, but before the memory had a chance to be migrated to destination. I didn't run an strace to be able to show the -EFAULT going away, but I've the confirmation of the below debug aid information (only visible with CONFIG_DEBUG_VM=y) going away with the patch: [20116.371461] FAULT_FLAG_ALLOW_RETRY missing 0 [20116.371464] CPU: 1 PID: 13381 Comm: vhost-events Not tainted 4.17.12-200.fc28.x86_64 #1 [20116.371465] Hardware name: LENOVO 20FAS2BN0A/20FAS2BN0A, BIOS N1CET54W (1.22 ) 02/10/2017 [20116.371466] Call Trace: [20116.371473] dump_stack+0x5c/0x80 [20116.371476] handle_userfault.cold.37+0x1b/0x22 [20116.371479] ? remove_wait_queue+0x20/0x60 [20116.371481] ? poll_freewait+0x45/0xa0 [20116.371483] ? do_sys_poll+0x31c/0x520 [20116.371485] ? radix_tree_lookup_slot+0x1e/0x50 [20116.371488] shmem_getpage_gfp+0xce7/0xe50 [20116.371491] ? page_add_file_rmap+0x1a/0x2c0 [20116.371493] shmem_fault+0x78/0x1e0 [20116.371495] ? filemap_map_pages+0x3a1/0x450 [20116.371498] __do_fault+0x1f/0xc0 [20116.371500] __handle_mm_fault+0xe2e/0x12f0 [20116.371502] handle_mm_fault+0xda/0x200 [20116.371504] __get_user_pages+0x238/0x790 [20116.371506] get_user_pages+0x3e/0x50 [20116.371510] kernel_get_mempolicy+0x40b/0x700 [20116.371512] ? vfs_write+0x170/0x1a0 [20116.371515] __x64_sys_get_mempolicy+0x21/0x30 [20116.371517] do_syscall_64+0x5b/0x160 [20116.371520] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The above harmless debug message (not a kernel crash, just a dump_stack()) is shown with CONFIG_DEBUG_VM=y to more quickly identify and improve kernel spots that may have to become "userfaultfd compliant" like this one (without having to run an strace and search for syscall misbehavior). Spots like the above are more closer to a kernel bug for the non-cooperative usages that Mike focuses on, than for for dpdk qemu-cooperative usages that reproduced it, but it's still nicer to get this fixed for dpdk too. The part of the patch that caused me to think is only the implementation issue of mpol_get, but it looks like it should work safe no matter the kind of mempolicy structure that is (the default static policy also starts at 1 so it'll go to 2 and back to 1 without crashing everything at 0). [rppt@linux.vnet.ibm.com: changelog addition] http://lkml.kernel.org/r/20180904073718.GA26916@rapoport-lnx Link: http://lkml.kernel.org/r/20180831214848.23676-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-26 22:05:16 +00:00
/*
* Take a refcount on the mpol, because we are about to
* drop the mmap_lock, after which only "pol" remains
* valid, "vma" is stale.
userfaultfd: allow get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) to trigger userfaults get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) called a get_user_pages that would not be waiting for userfaults before failing and it would hit on a SIGBUS instead. Using get_user_pages_locked/unlocked instead will allow get_mempolicy to allow userfaults to resolve the fault and fill the hole, before grabbing the node id of the page. If the user calls get_mempolicy() with MPOL_F_ADDR | MPOL_F_NODE for an address inside an area managed by uffd and there is no page at that address, the page allocation from within get_mempolicy() will fail because get_user_pages() does not allow for page fault retry required for uffd; the user will get SIGBUS. With this patch, the page fault will be resolved by the uffd and the get_mempolicy() will continue normally. Background: Via code review, previously the syscall would have returned -EFAULT (vm_fault_to_errno), now it will block and wait for an userfault (if it's waken before the fault is resolved it'll still -EFAULT). This way get_mempolicy will give a chance to an "unaware" app to be compliant with userfaults. The reason this visible change is that becoming "userfault compliant" cannot regress anything: all other syscalls including read(2)/write(2) had to become "userfault compliant" long time ago (that's one of the things userfaultfd can do that PROT_NONE and trapping segfaults can't). So this is just one more syscall that become "userfault compliant" like all other major ones already were. This has been happening on virtio-bridge dpdk process which just called get_mempolicy on the guest space post live migration, but before the memory had a chance to be migrated to destination. I didn't run an strace to be able to show the -EFAULT going away, but I've the confirmation of the below debug aid information (only visible with CONFIG_DEBUG_VM=y) going away with the patch: [20116.371461] FAULT_FLAG_ALLOW_RETRY missing 0 [20116.371464] CPU: 1 PID: 13381 Comm: vhost-events Not tainted 4.17.12-200.fc28.x86_64 #1 [20116.371465] Hardware name: LENOVO 20FAS2BN0A/20FAS2BN0A, BIOS N1CET54W (1.22 ) 02/10/2017 [20116.371466] Call Trace: [20116.371473] dump_stack+0x5c/0x80 [20116.371476] handle_userfault.cold.37+0x1b/0x22 [20116.371479] ? remove_wait_queue+0x20/0x60 [20116.371481] ? poll_freewait+0x45/0xa0 [20116.371483] ? do_sys_poll+0x31c/0x520 [20116.371485] ? radix_tree_lookup_slot+0x1e/0x50 [20116.371488] shmem_getpage_gfp+0xce7/0xe50 [20116.371491] ? page_add_file_rmap+0x1a/0x2c0 [20116.371493] shmem_fault+0x78/0x1e0 [20116.371495] ? filemap_map_pages+0x3a1/0x450 [20116.371498] __do_fault+0x1f/0xc0 [20116.371500] __handle_mm_fault+0xe2e/0x12f0 [20116.371502] handle_mm_fault+0xda/0x200 [20116.371504] __get_user_pages+0x238/0x790 [20116.371506] get_user_pages+0x3e/0x50 [20116.371510] kernel_get_mempolicy+0x40b/0x700 [20116.371512] ? vfs_write+0x170/0x1a0 [20116.371515] __x64_sys_get_mempolicy+0x21/0x30 [20116.371517] do_syscall_64+0x5b/0x160 [20116.371520] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The above harmless debug message (not a kernel crash, just a dump_stack()) is shown with CONFIG_DEBUG_VM=y to more quickly identify and improve kernel spots that may have to become "userfaultfd compliant" like this one (without having to run an strace and search for syscall misbehavior). Spots like the above are more closer to a kernel bug for the non-cooperative usages that Mike focuses on, than for for dpdk qemu-cooperative usages that reproduced it, but it's still nicer to get this fixed for dpdk too. The part of the patch that caused me to think is only the implementation issue of mpol_get, but it looks like it should work safe no matter the kind of mempolicy structure that is (the default static policy also starts at 1 so it'll go to 2 and back to 1 without crashing everything at 0). [rppt@linux.vnet.ibm.com: changelog addition] http://lkml.kernel.org/r/20180904073718.GA26916@rapoport-lnx Link: http://lkml.kernel.org/r/20180831214848.23676-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-26 22:05:16 +00:00
*/
pol_refcount = pol;
vma = NULL;
mpol_get(pol);
mmap_read_unlock(mm);
userfaultfd: allow get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) to trigger userfaults get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) called a get_user_pages that would not be waiting for userfaults before failing and it would hit on a SIGBUS instead. Using get_user_pages_locked/unlocked instead will allow get_mempolicy to allow userfaults to resolve the fault and fill the hole, before grabbing the node id of the page. If the user calls get_mempolicy() with MPOL_F_ADDR | MPOL_F_NODE for an address inside an area managed by uffd and there is no page at that address, the page allocation from within get_mempolicy() will fail because get_user_pages() does not allow for page fault retry required for uffd; the user will get SIGBUS. With this patch, the page fault will be resolved by the uffd and the get_mempolicy() will continue normally. Background: Via code review, previously the syscall would have returned -EFAULT (vm_fault_to_errno), now it will block and wait for an userfault (if it's waken before the fault is resolved it'll still -EFAULT). This way get_mempolicy will give a chance to an "unaware" app to be compliant with userfaults. The reason this visible change is that becoming "userfault compliant" cannot regress anything: all other syscalls including read(2)/write(2) had to become "userfault compliant" long time ago (that's one of the things userfaultfd can do that PROT_NONE and trapping segfaults can't). So this is just one more syscall that become "userfault compliant" like all other major ones already were. This has been happening on virtio-bridge dpdk process which just called get_mempolicy on the guest space post live migration, but before the memory had a chance to be migrated to destination. I didn't run an strace to be able to show the -EFAULT going away, but I've the confirmation of the below debug aid information (only visible with CONFIG_DEBUG_VM=y) going away with the patch: [20116.371461] FAULT_FLAG_ALLOW_RETRY missing 0 [20116.371464] CPU: 1 PID: 13381 Comm: vhost-events Not tainted 4.17.12-200.fc28.x86_64 #1 [20116.371465] Hardware name: LENOVO 20FAS2BN0A/20FAS2BN0A, BIOS N1CET54W (1.22 ) 02/10/2017 [20116.371466] Call Trace: [20116.371473] dump_stack+0x5c/0x80 [20116.371476] handle_userfault.cold.37+0x1b/0x22 [20116.371479] ? remove_wait_queue+0x20/0x60 [20116.371481] ? poll_freewait+0x45/0xa0 [20116.371483] ? do_sys_poll+0x31c/0x520 [20116.371485] ? radix_tree_lookup_slot+0x1e/0x50 [20116.371488] shmem_getpage_gfp+0xce7/0xe50 [20116.371491] ? page_add_file_rmap+0x1a/0x2c0 [20116.371493] shmem_fault+0x78/0x1e0 [20116.371495] ? filemap_map_pages+0x3a1/0x450 [20116.371498] __do_fault+0x1f/0xc0 [20116.371500] __handle_mm_fault+0xe2e/0x12f0 [20116.371502] handle_mm_fault+0xda/0x200 [20116.371504] __get_user_pages+0x238/0x790 [20116.371506] get_user_pages+0x3e/0x50 [20116.371510] kernel_get_mempolicy+0x40b/0x700 [20116.371512] ? vfs_write+0x170/0x1a0 [20116.371515] __x64_sys_get_mempolicy+0x21/0x30 [20116.371517] do_syscall_64+0x5b/0x160 [20116.371520] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The above harmless debug message (not a kernel crash, just a dump_stack()) is shown with CONFIG_DEBUG_VM=y to more quickly identify and improve kernel spots that may have to become "userfaultfd compliant" like this one (without having to run an strace and search for syscall misbehavior). Spots like the above are more closer to a kernel bug for the non-cooperative usages that Mike focuses on, than for for dpdk qemu-cooperative usages that reproduced it, but it's still nicer to get this fixed for dpdk too. The part of the patch that caused me to think is only the implementation issue of mpol_get, but it looks like it should work safe no matter the kind of mempolicy structure that is (the default static policy also starts at 1 so it'll go to 2 and back to 1 without crashing everything at 0). [rppt@linux.vnet.ibm.com: changelog addition] http://lkml.kernel.org/r/20180904073718.GA26916@rapoport-lnx Link: http://lkml.kernel.org/r/20180831214848.23676-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-26 22:05:16 +00:00
err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
*policy = next_node_in(current->il_prev, pol->nodes);
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
} else if (pol == current->mempolicy &&
pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
if (current->il_weight)
*policy = current->il_prev;
else
*policy = next_node_in(current->il_prev,
pol->nodes);
} else {
err = -EINVAL;
goto out;
}
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
} else {
*policy = pol == &default_policy ? MPOL_DEFAULT :
pol->mode;
/*
* Internal mempolicy flags must be masked off before exposing
* the policy to userspace.
*/
*policy |= (pol->flags & MPOL_MODE_FLAGS);
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
}
err = 0;
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
if (nmask) {
if (mpol_store_user_nodemask(pol)) {
*nmask = pol->w.user_nodemask;
} else {
task_lock(current);
get_policy_nodemask(pol, nmask);
task_unlock(current);
}
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
}
out:
mempolicy: rework mempolicy Reference Counting [yet again] After further discussion with Christoph Lameter, it has become clear that my earlier attempts to clean up the mempolicy reference counting were a bit of overkill in some areas, resulting in superflous ref/unref in what are usually fast paths. In other areas, further inspection reveals that I botched the unref for interleave policies. A separate patch, suitable for upstream/stable trees, fixes up the known errors in the previous attempt to fix reference counting. This patch reworks the memory policy referencing counting and, one hopes, simplifies the code. Maybe I'll get it right this time. See the update to the numa_memory_policy.txt document for a discussion of memory policy reference counting that motivates this patch. Summary: Lookup of mempolicy, based on (vma, address) need only add a reference for shared policy, and we need only unref the policy when finished for shared policies. So, this patch backs out all of the unneeded extra reference counting added by my previous attempt. It then unrefs only shared policies when we're finished with them, using the mpol_cond_put() [conditional put] helper function introduced by this patch. Note that shmem_swapin() calls read_swap_cache_async() with a dummy vma containing just the policy. read_swap_cache_async() can call alloc_page_vma() multiple times, so we can't let alloc_page_vma() unref the shared policy in this case. To avoid this, we make a copy of any non-null shared policy and remove the MPOL_F_SHARED flag from the copy. This copy occurs before reading a page [or multiple pages] from swap, so the overhead should not be an issue here. I introduced a new static inline function "mpol_cond_copy()" to copy the shared policy to an on-stack policy and remove the flags that would require a conditional free. The current implementation of mpol_cond_copy() assumes that the struct mempolicy contains no pointers to dynamically allocated structures that must be duplicated or reference counted during copy. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:16 +00:00
mpol_cond_put(pol);
if (vma)
mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse <walken@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com> Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Liam Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ying Han <yinghan@google.com> Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-09 04:33:25 +00:00
mmap_read_unlock(mm);
userfaultfd: allow get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) to trigger userfaults get_mempolicy(MPOL_F_NODE|MPOL_F_ADDR) called a get_user_pages that would not be waiting for userfaults before failing and it would hit on a SIGBUS instead. Using get_user_pages_locked/unlocked instead will allow get_mempolicy to allow userfaults to resolve the fault and fill the hole, before grabbing the node id of the page. If the user calls get_mempolicy() with MPOL_F_ADDR | MPOL_F_NODE for an address inside an area managed by uffd and there is no page at that address, the page allocation from within get_mempolicy() will fail because get_user_pages() does not allow for page fault retry required for uffd; the user will get SIGBUS. With this patch, the page fault will be resolved by the uffd and the get_mempolicy() will continue normally. Background: Via code review, previously the syscall would have returned -EFAULT (vm_fault_to_errno), now it will block and wait for an userfault (if it's waken before the fault is resolved it'll still -EFAULT). This way get_mempolicy will give a chance to an "unaware" app to be compliant with userfaults. The reason this visible change is that becoming "userfault compliant" cannot regress anything: all other syscalls including read(2)/write(2) had to become "userfault compliant" long time ago (that's one of the things userfaultfd can do that PROT_NONE and trapping segfaults can't). So this is just one more syscall that become "userfault compliant" like all other major ones already were. This has been happening on virtio-bridge dpdk process which just called get_mempolicy on the guest space post live migration, but before the memory had a chance to be migrated to destination. I didn't run an strace to be able to show the -EFAULT going away, but I've the confirmation of the below debug aid information (only visible with CONFIG_DEBUG_VM=y) going away with the patch: [20116.371461] FAULT_FLAG_ALLOW_RETRY missing 0 [20116.371464] CPU: 1 PID: 13381 Comm: vhost-events Not tainted 4.17.12-200.fc28.x86_64 #1 [20116.371465] Hardware name: LENOVO 20FAS2BN0A/20FAS2BN0A, BIOS N1CET54W (1.22 ) 02/10/2017 [20116.371466] Call Trace: [20116.371473] dump_stack+0x5c/0x80 [20116.371476] handle_userfault.cold.37+0x1b/0x22 [20116.371479] ? remove_wait_queue+0x20/0x60 [20116.371481] ? poll_freewait+0x45/0xa0 [20116.371483] ? do_sys_poll+0x31c/0x520 [20116.371485] ? radix_tree_lookup_slot+0x1e/0x50 [20116.371488] shmem_getpage_gfp+0xce7/0xe50 [20116.371491] ? page_add_file_rmap+0x1a/0x2c0 [20116.371493] shmem_fault+0x78/0x1e0 [20116.371495] ? filemap_map_pages+0x3a1/0x450 [20116.371498] __do_fault+0x1f/0xc0 [20116.371500] __handle_mm_fault+0xe2e/0x12f0 [20116.371502] handle_mm_fault+0xda/0x200 [20116.371504] __get_user_pages+0x238/0x790 [20116.371506] get_user_pages+0x3e/0x50 [20116.371510] kernel_get_mempolicy+0x40b/0x700 [20116.371512] ? vfs_write+0x170/0x1a0 [20116.371515] __x64_sys_get_mempolicy+0x21/0x30 [20116.371517] do_syscall_64+0x5b/0x160 [20116.371520] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The above harmless debug message (not a kernel crash, just a dump_stack()) is shown with CONFIG_DEBUG_VM=y to more quickly identify and improve kernel spots that may have to become "userfaultfd compliant" like this one (without having to run an strace and search for syscall misbehavior). Spots like the above are more closer to a kernel bug for the non-cooperative usages that Mike focuses on, than for for dpdk qemu-cooperative usages that reproduced it, but it's still nicer to get this fixed for dpdk too. The part of the patch that caused me to think is only the implementation issue of mpol_get, but it looks like it should work safe no matter the kind of mempolicy structure that is (the default static policy also starts at 1 so it'll go to 2 and back to 1 without crashing everything at 0). [rppt@linux.vnet.ibm.com: changelog addition] http://lkml.kernel.org/r/20180904073718.GA26916@rapoport-lnx Link: http://lkml.kernel.org/r/20180831214848.23676-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: Maxime Coquelin <maxime.coquelin@redhat.com> Tested-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Reviewed-by: Mike Rapoport <rppt@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-26 22:05:16 +00:00
if (pol_refcount)
mpol_put(pol_refcount);
return err;
}
#ifdef CONFIG_MIGRATION
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags)
{
/*
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
* To check if the folio is shared, ideally we want to make sure
* every page is mapped to the same process. Doing that is very
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* expensive, so check the estimated sharers of the folio instead.
*/
if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
mm: change to return bool for folio_isolate_lru() Patch series "Change the return value for page isolation functions", v3. Now the page isolation functions did not return a boolean to indicate success or not, instead it will return a negative error when failed to isolate a page. So below code used in most places seem a boolean success/failure thing, which can confuse people whether the isolation is successful. if (folio_isolate_lru(folio)) continue; Moreover the page isolation functions only return 0 or -EBUSY, and most users did not care about the negative error except for few users, thus we can convert all page isolation functions to return a boolean value, which can remove the confusion to make code more clear. No functional changes intended in this patch series. This patch (of 4): Now the folio_isolate_lru() did not return a boolean value to indicate isolation success or not, however below code checking the return value can make people think that it was a boolean success/failure thing, which makes people easy to make mistakes (see the fix patch[1]). if (folio_isolate_lru(folio)) continue; Thus it's better to check the negative error value expilictly returned by folio_isolate_lru(), which makes code more clear per Linus's suggestion[2]. Moreover Matthew suggested we can convert the isolation functions to return a boolean[3], since most users did not care about the negative error value, and can also remove the confusing of checking return value. So this patch converts the folio_isolate_lru() to return a boolean value, which means return 'true' to indicate the folio isolation is successful, and 'false' means a failure to isolation. Meanwhile changing all users' logic of checking the isolation state. No functional changes intended. [1] https://lore.kernel.org/all/20230131063206.28820-1-Kuan-Ying.Lee@mediatek.com/T/#u [2] https://lore.kernel.org/all/CAHk-=wiBrY+O-4=2mrbVyxR+hOqfdJ=Do6xoucfJ9_5az01L4Q@mail.gmail.com/ [3] https://lore.kernel.org/all/Y+sTFqwMNAjDvxw3@casper.infradead.org/ Link: https://lkml.kernel.org/r/cover.1676424378.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/8a4e3679ed4196168efadf7ea36c038f2f7d5aa9.1676424378.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com> Reviewed-by: SeongJae Park <sj@kernel.org> Acked-by: David Hildenbrand <david@redhat.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Muchun Song <muchun.song@linux.dev> Cc: Naoya Horiguchi <naoya.horiguchi@nec.com> Cc: Oscar Salvador <osalvador@suse.de> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: Shakeel Butt <shakeelb@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-02-15 10:39:34 +00:00
if (folio_isolate_lru(folio)) {
list_add_tail(&folio->lru, foliolist);
node_stat_mod_folio(folio,
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
} else {
mm: mempolicy: handle vma with unmovable pages mapped correctly in mbind When running syzkaller internally, we ran into the below bug on 4.9.x kernel: kernel BUG at mm/huge_memory.c:2124! invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 1518 Comm: syz-executor107 Not tainted 4.9.168+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.5.1 01/01/2011 task: ffff880067b34900 task.stack: ffff880068998000 RIP: split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124 Call Trace: split_huge_page include/linux/huge_mm.h:100 [inline] queue_pages_pte_range+0x7e1/0x1480 mm/mempolicy.c:538 walk_pmd_range mm/pagewalk.c:50 [inline] walk_pud_range mm/pagewalk.c:90 [inline] walk_pgd_range mm/pagewalk.c:116 [inline] __walk_page_range+0x44a/0xdb0 mm/pagewalk.c:208 walk_page_range+0x154/0x370 mm/pagewalk.c:285 queue_pages_range+0x115/0x150 mm/mempolicy.c:694 do_mbind mm/mempolicy.c:1241 [inline] SYSC_mbind+0x3c3/0x1030 mm/mempolicy.c:1370 SyS_mbind+0x46/0x60 mm/mempolicy.c:1352 do_syscall_64+0x1d2/0x600 arch/x86/entry/common.c:282 entry_SYSCALL_64_after_swapgs+0x5d/0xdb Code: c7 80 1c 02 00 e8 26 0a 76 01 <0f> 0b 48 c7 c7 40 46 45 84 e8 4c RIP [<ffffffff81895d6b>] split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124 RSP <ffff88006899f980> with the below test: uint64_t r[1] = {0xffffffffffffffff}; int main(void) { syscall(__NR_mmap, 0x20000000, 0x1000000, 3, 0x32, -1, 0); intptr_t res = 0; res = syscall(__NR_socket, 0x11, 3, 0x300); if (res != -1) r[0] = res; *(uint32_t*)0x20000040 = 0x10000; *(uint32_t*)0x20000044 = 1; *(uint32_t*)0x20000048 = 0xc520; *(uint32_t*)0x2000004c = 1; syscall(__NR_setsockopt, r[0], 0x107, 0xd, 0x20000040, 0x10); syscall(__NR_mmap, 0x20fed000, 0x10000, 0, 0x8811, r[0], 0); *(uint64_t*)0x20000340 = 2; syscall(__NR_mbind, 0x20ff9000, 0x4000, 0x4002, 0x20000340, 0x45d4, 3); return 0; } Actually the test does: mmap(0x20000000, 16777216, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x20000000 socket(AF_PACKET, SOCK_RAW, 768) = 3 setsockopt(3, SOL_PACKET, PACKET_TX_RING, {block_size=65536, block_nr=1, frame_size=50464, frame_nr=1}, 16) = 0 mmap(0x20fed000, 65536, PROT_NONE, MAP_SHARED|MAP_FIXED|MAP_POPULATE|MAP_DENYWRITE, 3, 0) = 0x20fed000 mbind(..., MPOL_MF_STRICT|MPOL_MF_MOVE) = 0 The setsockopt() would allocate compound pages (16 pages in this test) for packet tx ring, then the mmap() would call packet_mmap() to map the pages into the user address space specified by the mmap() call. When calling mbind(), it would scan the vma to queue the pages for migration to the new node. It would split any huge page since 4.9 doesn't support THP migration, however, the packet tx ring compound pages are not THP and even not movable. So, the above bug is triggered. However, the later kernel is not hit by this issue due to commit d44d363f6578 ("mm: don't assume anonymous pages have SwapBacked flag"), which just removes the PageSwapBacked check for a different reason. But, there is a deeper issue. According to the semantic of mbind(), it should return -EIO if MPOL_MF_MOVE or MPOL_MF_MOVE_ALL was specified and MPOL_MF_STRICT was also specified, but the kernel was unable to move all existing pages in the range. The tx ring of the packet socket is definitely not movable, however, mbind() returns success for this case. Although the most socket file associates with non-movable pages, but XDP may have movable pages from gup. So, it sounds not fine to just check the underlying file type of vma in vma_migratable(). Change migrate_page_add() to check if the page is movable or not, if it is unmovable, just return -EIO. But do not abort pte walk immediately, since there may be pages off LRU temporarily. We should migrate other pages if MPOL_MF_MOVE* is specified. Set has_unmovable flag if some paged could not be not moved, then return -EIO for mbind() eventually. With this change the above test would return -EIO as expected. [yang.shi@linux.alibaba.com: fix review comments from Vlastimil] Link: http://lkml.kernel.org/r/1563556862-54056-3-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1561162809-59140-3-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-13 22:37:18 +00:00
/*
* Non-movable folio may reach here. And, there may be
* temporary off LRU folios or non-LRU movable folios.
* Treat them as unmovable folios since they can't be
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* isolated, so they can't be moved at the moment.
mm: mempolicy: handle vma with unmovable pages mapped correctly in mbind When running syzkaller internally, we ran into the below bug on 4.9.x kernel: kernel BUG at mm/huge_memory.c:2124! invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 1518 Comm: syz-executor107 Not tainted 4.9.168+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.5.1 01/01/2011 task: ffff880067b34900 task.stack: ffff880068998000 RIP: split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124 Call Trace: split_huge_page include/linux/huge_mm.h:100 [inline] queue_pages_pte_range+0x7e1/0x1480 mm/mempolicy.c:538 walk_pmd_range mm/pagewalk.c:50 [inline] walk_pud_range mm/pagewalk.c:90 [inline] walk_pgd_range mm/pagewalk.c:116 [inline] __walk_page_range+0x44a/0xdb0 mm/pagewalk.c:208 walk_page_range+0x154/0x370 mm/pagewalk.c:285 queue_pages_range+0x115/0x150 mm/mempolicy.c:694 do_mbind mm/mempolicy.c:1241 [inline] SYSC_mbind+0x3c3/0x1030 mm/mempolicy.c:1370 SyS_mbind+0x46/0x60 mm/mempolicy.c:1352 do_syscall_64+0x1d2/0x600 arch/x86/entry/common.c:282 entry_SYSCALL_64_after_swapgs+0x5d/0xdb Code: c7 80 1c 02 00 e8 26 0a 76 01 <0f> 0b 48 c7 c7 40 46 45 84 e8 4c RIP [<ffffffff81895d6b>] split_huge_page_to_list+0x8fb/0x1030 mm/huge_memory.c:2124 RSP <ffff88006899f980> with the below test: uint64_t r[1] = {0xffffffffffffffff}; int main(void) { syscall(__NR_mmap, 0x20000000, 0x1000000, 3, 0x32, -1, 0); intptr_t res = 0; res = syscall(__NR_socket, 0x11, 3, 0x300); if (res != -1) r[0] = res; *(uint32_t*)0x20000040 = 0x10000; *(uint32_t*)0x20000044 = 1; *(uint32_t*)0x20000048 = 0xc520; *(uint32_t*)0x2000004c = 1; syscall(__NR_setsockopt, r[0], 0x107, 0xd, 0x20000040, 0x10); syscall(__NR_mmap, 0x20fed000, 0x10000, 0, 0x8811, r[0], 0); *(uint64_t*)0x20000340 = 2; syscall(__NR_mbind, 0x20ff9000, 0x4000, 0x4002, 0x20000340, 0x45d4, 3); return 0; } Actually the test does: mmap(0x20000000, 16777216, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x20000000 socket(AF_PACKET, SOCK_RAW, 768) = 3 setsockopt(3, SOL_PACKET, PACKET_TX_RING, {block_size=65536, block_nr=1, frame_size=50464, frame_nr=1}, 16) = 0 mmap(0x20fed000, 65536, PROT_NONE, MAP_SHARED|MAP_FIXED|MAP_POPULATE|MAP_DENYWRITE, 3, 0) = 0x20fed000 mbind(..., MPOL_MF_STRICT|MPOL_MF_MOVE) = 0 The setsockopt() would allocate compound pages (16 pages in this test) for packet tx ring, then the mmap() would call packet_mmap() to map the pages into the user address space specified by the mmap() call. When calling mbind(), it would scan the vma to queue the pages for migration to the new node. It would split any huge page since 4.9 doesn't support THP migration, however, the packet tx ring compound pages are not THP and even not movable. So, the above bug is triggered. However, the later kernel is not hit by this issue due to commit d44d363f6578 ("mm: don't assume anonymous pages have SwapBacked flag"), which just removes the PageSwapBacked check for a different reason. But, there is a deeper issue. According to the semantic of mbind(), it should return -EIO if MPOL_MF_MOVE or MPOL_MF_MOVE_ALL was specified and MPOL_MF_STRICT was also specified, but the kernel was unable to move all existing pages in the range. The tx ring of the packet socket is definitely not movable, however, mbind() returns success for this case. Although the most socket file associates with non-movable pages, but XDP may have movable pages from gup. So, it sounds not fine to just check the underlying file type of vma in vma_migratable(). Change migrate_page_add() to check if the page is movable or not, if it is unmovable, just return -EIO. But do not abort pte walk immediately, since there may be pages off LRU temporarily. We should migrate other pages if MPOL_MF_MOVE* is specified. Set has_unmovable flag if some paged could not be not moved, then return -EIO for mbind() eventually. With this change the above test would return -EIO as expected. [yang.shi@linux.alibaba.com: fix review comments from Vlastimil] Link: http://lkml.kernel.org/r/1563556862-54056-3-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1561162809-59140-3-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-13 22:37:18 +00:00
*/
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
return false;
vmscan: move isolate_lru_page() to vmscan.c On large memory systems, the VM can spend way too much time scanning through pages that it cannot (or should not) evict from memory. Not only does it use up CPU time, but it also provokes lock contention and can leave large systems under memory presure in a catatonic state. This patch series improves VM scalability by: 1) putting filesystem backed, swap backed and unevictable pages onto their own LRUs, so the system only scans the pages that it can/should evict from memory 2) switching to two handed clock replacement for the anonymous LRUs, so the number of pages that need to be scanned when the system starts swapping is bound to a reasonable number 3) keeping unevictable pages off the LRU completely, so the VM does not waste CPU time scanning them. ramfs, ramdisk, SHM_LOCKED shared memory segments and mlock()ed VMA pages are keept on the unevictable list. This patch: isolate_lru_page logically belongs to be in vmscan.c than migrate.c. It is tough, because we don't need that function without memory migration so there is a valid argument to have it in migrate.c. However a subsequent patch needs to make use of it in the core mm, so we can happily move it to vmscan.c. Also, make the function a little more generic by not requiring that it adds an isolated page to a given list. Callers can do that. Note that we now have '__isolate_lru_page()', that does something quite different, visible outside of vmscan.c for use with memory controller. Methinks we need to rationalize these names/purposes. --lts [akpm@linux-foundation.org: fix mm/memory_hotplug.c build] Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Rik van Riel <riel@redhat.com> Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 03:26:09 +00:00
}
}
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
return true;
}
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
*/
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
int flags)
{
nodemask_t nmask;
struct vm_area_struct *vma;
LIST_HEAD(pagelist);
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
long nr_failed;
long err = 0;
struct migration_target_control mtc = {
.nid = dest,
.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
};
nodes_clear(nmask);
node_set(source, nmask);
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
mmap_read_lock(mm);
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
vma = find_vma(mm, 0);
/*
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* This does not migrate the range, but isolates all pages that
* need migration. Between passing in the full user address
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
* but passes back the count of pages which could not be isolated.
*/
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
mmap_read_unlock(mm);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, alloc_migration_target, NULL,
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
if (err)
putback_movable_pages(&pagelist);
}
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (err >= 0)
err += nr_failed;
return err;
}
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
/*
* Move pages between the two nodesets so as to preserve the physical
* layout as much as possible.
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
*
* Returns the number of page that could not be moved.
*/
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
{
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
long nr_failed = 0;
long err = 0;
nodemask_t tmp;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
lru_cache_disable();
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
* bit in 'tmp', and return that <source, dest> pair for migration.
* The pair of nodemasks 'to' and 'from' define the map.
*
* If no pair of bits is found that way, fallback to picking some
* pair of 'source' and 'dest' bits that are not the same. If the
* 'source' and 'dest' bits are the same, this represents a node
* that will be migrating to itself, so no pages need move.
*
* If no bits are left in 'tmp', or if all remaining bits left
* in 'tmp' correspond to the same bit in 'to', return false
* (nothing left to migrate).
*
* This lets us pick a pair of nodes to migrate between, such that
* if possible the dest node is not already occupied by some other
* source node, minimizing the risk of overloading the memory on a
* node that would happen if we migrated incoming memory to a node
* before migrating outgoing memory source that same node.
*
* A single scan of tmp is sufficient. As we go, we remember the
* most recent <s, d> pair that moved (s != d). If we find a pair
* that not only moved, but what's better, moved to an empty slot
* (d is not set in tmp), then we break out then, with that pair.
* Otherwise when we finish scanning from_tmp, we at least have the
* most recent <s, d> pair that moved. If we get all the way through
* the scan of tmp without finding any node that moved, much less
* moved to an empty node, then there is nothing left worth migrating.
*/
tmp = *from;
while (!nodes_empty(tmp)) {
mm/mempool: minor coding style tweaks Various coding style tweaks to various files under mm/ [daizhiyuan@phytium.com.cn: mm/swapfile: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614223624-16055-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/sparse: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614227288-19363-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/vmscan: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614227649-19853-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/compaction: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228218-20770-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/oom_kill: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228360-21168-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/shmem: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228504-21491-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/page_alloc: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228613-21754-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/filemap: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228936-22337-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/mlock: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613956588-2453-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/frontswap: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613962668-15045-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/vmalloc: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613963379-15988-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/memory_hotplug: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613971784-24878-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/mempolicy: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613972228-25501-1-git-send-email-daizhiyuan@phytium.com.cn Link: https://lkml.kernel.org/r/1614222374-13805-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai <daizhiyuan@phytium.com.cn> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-05-05 01:40:12 +00:00
int s, d;
int source = NUMA_NO_NODE;
int dest = 0;
for_each_node_mask(s, tmp) {
mm: do_migrate_pages() calls migrate_to_node() even if task is already on a correct node While running an application that moves tasks from one cpuset to another I noticed that it takes much longer and moves many more pages than expected. The reason for this is do_migrate_pages() does its best to preserve the relative node differential from the first node of the cpuset because the application may have been written with that in mind. If memory was interleaved on the nodes of the source cpuset by an application do_migrate_pages() will try its best to maintain that interleaving on the nodes of the destination cpuset. This means copying the memory from all source nodes to the destination nodes even if the source and destination nodes overlap. This is a problem for userspace NUMA placement tools. The amount of time spent doing extra memory moves cancels out some of the NUMA performance improvements. Furthermore, if the number of source and destination nodes are to maintain the previous interleaving layout anyway. This patch changes do_migrate_pages() to only preserve the relative layout inside the program if the number of NUMA nodes in the source and destination mask are the same. If the number is different, we do a much more efficient migration by not touching memory that is in an allowed node. This preserves the old behaviour for programs that want it, while allowing a userspace NUMA placement tool to use the new, faster migration. This improves performance in our tests by up to a factor of 7. Without this change migrating tasks from a cpuset containing nodes 0-7 to a cpuset containing nodes 3-4, we migrate from ALL the nodes even if they are in the both the source and destination nodesets: Migrating 7 to 4 Migrating 6 to 3 Migrating 5 to 4 Migrating 4 to 3 Migrating 1 to 4 Migrating 3 to 4 Migrating 0 to 3 Migrating 2 to 3 With this change we only migrate from nodes that are not in the destination nodesets: Migrating 7 to 4 Migrating 6 to 3 Migrating 5 to 4 Migrating 2 to 3 Migrating 1 to 4 Migrating 0 to 3 Yet if we move from a cpuset containing nodes 2,3,4 to a cpuset containing 3,4,5 we still do move everything so that we preserve the desired NUMA offsets: Migrating 4 to 5 Migrating 3 to 4 Migrating 2 to 3 As far as performance is concerned this simple patch improves the time it takes to move 14, 20 and 26 large tasks from a cpuset containing nodes 0-7 to a cpuset containing nodes 1 & 3 by up to a factor of 7. Here are the timings with and without the patch: BEFORE PATCH -- Move times: 59, 140, 651 seconds ============ Moving 14 tasks from nodes (0-7) to nodes (1,3) numad(8780) do_migrate_pages (mm=0xffff88081d414400 from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x7 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x6 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x5 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x4 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x2 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x1 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 14 tasks...) PID 8890 moved to node(s) 1,3 in 59.2 seconds Moving 20 tasks from nodes (0-7) to nodes (1,4-5) numad(8780) do_migrate_pages (mm=0xffff88081d88c700 from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x7 dest=0x4 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x6 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x3 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x2 dest=0x5 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x1 dest=0x4 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 20 tasks...) PID 8962 moved to node(s) 1,4-5 in 139.88 seconds Moving 26 tasks from nodes (0-7) to nodes (1-3,5) numad(8780) do_migrate_pages (mm=0xffff88081d5bc740 from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x7 dest=0x5 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x6 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x5 dest=0x2 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x3 dest=0x5 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x2 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x1 dest=0x2 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x0 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x4 dest=0x1 flags=0x4) (Above moves repeated for each of the 26 tasks...) PID 9058 moved to node(s) 1-3,5 in 651.45 seconds AFTER PATCH -- Move times: 42, 56, 93 seconds =========== Moving 14 tasks from nodes (0-7) to nodes (5,7) numad(33209) do_migrate_pages (mm=0xffff88101d5ff140 from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x6 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x4 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x3 dest=0x7 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x2 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x1 dest=0x7 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x0 dest=0x5 flags=0x4) (Above moves repeated for each of the 14 tasks...) PID 33221 moved to node(s) 5,7 in 41.67 seconds Moving 20 tasks from nodes (0-7) to nodes (1,3,5) numad(33209) do_migrate_pages (mm=0xffff88101d6c37c0 from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x7 dest=0x3 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x6 dest=0x1 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x4 dest=0x3 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x2 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 20 tasks...) PID 33289 moved to node(s) 1,3,5 in 56.3 seconds Moving 26 tasks from nodes (0-7) to nodes (1,3,5,7) numad(33209) do_migrate_pages (mm=0xffff88101d924400 from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x6 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x4 dest=0x1 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x2 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 26 tasks...) PID 33372 moved to node(s) 1,3,5,7 in 92.67 seconds [akpm@linux-foundation.org: clean up comment layout] Signed-off-by: Larry Woodman <lwoodman@redhat.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Mel Gorman <mel@csn.ul.ie> Reviewed-by: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-29 22:06:24 +00:00
/*
* do_migrate_pages() tries to maintain the relative
* node relationship of the pages established between
* threads and memory areas.
*
* However if the number of source nodes is not equal to
* the number of destination nodes we can not preserve
* this node relative relationship. In that case, skip
* copying memory from a node that is in the destination
* mask.
*
* Example: [2,3,4] -> [3,4,5] moves everything.
* [0-7] - > [3,4,5] moves only 0,1,2,6,7.
*/
if ((nodes_weight(*from) != nodes_weight(*to)) &&
(node_isset(s, *to)))
mm: do_migrate_pages() calls migrate_to_node() even if task is already on a correct node While running an application that moves tasks from one cpuset to another I noticed that it takes much longer and moves many more pages than expected. The reason for this is do_migrate_pages() does its best to preserve the relative node differential from the first node of the cpuset because the application may have been written with that in mind. If memory was interleaved on the nodes of the source cpuset by an application do_migrate_pages() will try its best to maintain that interleaving on the nodes of the destination cpuset. This means copying the memory from all source nodes to the destination nodes even if the source and destination nodes overlap. This is a problem for userspace NUMA placement tools. The amount of time spent doing extra memory moves cancels out some of the NUMA performance improvements. Furthermore, if the number of source and destination nodes are to maintain the previous interleaving layout anyway. This patch changes do_migrate_pages() to only preserve the relative layout inside the program if the number of NUMA nodes in the source and destination mask are the same. If the number is different, we do a much more efficient migration by not touching memory that is in an allowed node. This preserves the old behaviour for programs that want it, while allowing a userspace NUMA placement tool to use the new, faster migration. This improves performance in our tests by up to a factor of 7. Without this change migrating tasks from a cpuset containing nodes 0-7 to a cpuset containing nodes 3-4, we migrate from ALL the nodes even if they are in the both the source and destination nodesets: Migrating 7 to 4 Migrating 6 to 3 Migrating 5 to 4 Migrating 4 to 3 Migrating 1 to 4 Migrating 3 to 4 Migrating 0 to 3 Migrating 2 to 3 With this change we only migrate from nodes that are not in the destination nodesets: Migrating 7 to 4 Migrating 6 to 3 Migrating 5 to 4 Migrating 2 to 3 Migrating 1 to 4 Migrating 0 to 3 Yet if we move from a cpuset containing nodes 2,3,4 to a cpuset containing 3,4,5 we still do move everything so that we preserve the desired NUMA offsets: Migrating 4 to 5 Migrating 3 to 4 Migrating 2 to 3 As far as performance is concerned this simple patch improves the time it takes to move 14, 20 and 26 large tasks from a cpuset containing nodes 0-7 to a cpuset containing nodes 1 & 3 by up to a factor of 7. Here are the timings with and without the patch: BEFORE PATCH -- Move times: 59, 140, 651 seconds ============ Moving 14 tasks from nodes (0-7) to nodes (1,3) numad(8780) do_migrate_pages (mm=0xffff88081d414400 from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x7 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x6 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x5 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x4 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x2 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x1 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d414400 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 14 tasks...) PID 8890 moved to node(s) 1,3 in 59.2 seconds Moving 20 tasks from nodes (0-7) to nodes (1,4-5) numad(8780) do_migrate_pages (mm=0xffff88081d88c700 from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x7 dest=0x4 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x6 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x3 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x2 dest=0x5 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x1 dest=0x4 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d88c700 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 20 tasks...) PID 8962 moved to node(s) 1,4-5 in 139.88 seconds Moving 26 tasks from nodes (0-7) to nodes (1-3,5) numad(8780) do_migrate_pages (mm=0xffff88081d5bc740 from_nodes=0xffff880818c81d28 to_nodes=0xffff880818c81ce8 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x7 dest=0x5 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x6 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x5 dest=0x2 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x3 dest=0x5 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x2 dest=0x3 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x1 dest=0x2 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x0 dest=0x1 flags=0x4) numad(8780) migrate_to_node (mm=0xffff88081d5bc740 source=0x4 dest=0x1 flags=0x4) (Above moves repeated for each of the 26 tasks...) PID 9058 moved to node(s) 1-3,5 in 651.45 seconds AFTER PATCH -- Move times: 42, 56, 93 seconds =========== Moving 14 tasks from nodes (0-7) to nodes (5,7) numad(33209) do_migrate_pages (mm=0xffff88101d5ff140 from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x6 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x4 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x3 dest=0x7 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x2 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x1 dest=0x7 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d5ff140 source=0x0 dest=0x5 flags=0x4) (Above moves repeated for each of the 14 tasks...) PID 33221 moved to node(s) 5,7 in 41.67 seconds Moving 20 tasks from nodes (0-7) to nodes (1,3,5) numad(33209) do_migrate_pages (mm=0xffff88101d6c37c0 from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x7 dest=0x3 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x6 dest=0x1 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x4 dest=0x3 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x2 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d6c37c0 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 20 tasks...) PID 33289 moved to node(s) 1,3,5 in 56.3 seconds Moving 26 tasks from nodes (0-7) to nodes (1,3,5,7) numad(33209) do_migrate_pages (mm=0xffff88101d924400 from_nodes=0xffff88101e7b5d28 to_nodes=0xffff88101e7b5ce8 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x6 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x4 dest=0x1 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x2 dest=0x5 flags=0x4) numad(33209) migrate_to_node (mm=0xffff88101d924400 source=0x0 dest=0x1 flags=0x4) (Above moves repeated for each of the 26 tasks...) PID 33372 moved to node(s) 1,3,5,7 in 92.67 seconds [akpm@linux-foundation.org: clean up comment layout] Signed-off-by: Larry Woodman <lwoodman@redhat.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Mel Gorman <mel@csn.ul.ie> Reviewed-by: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-05-29 22:06:24 +00:00
continue;
d = node_remap(s, *from, *to);
if (s == d)
continue;
source = s; /* Node moved. Memorize */
dest = d;
/* dest not in remaining from nodes? */
if (!node_isset(dest, tmp))
break;
}
if (source == NUMA_NO_NODE)
break;
node_clear(source, tmp);
err = migrate_to_node(mm, source, dest, flags);
if (err > 0)
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
nr_failed += err;
if (err < 0)
break;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
}
mm: disable LRU pagevec during the migration temporarily LRU pagevec holds refcount of pages until the pagevec are drained. It could prevent migration since the refcount of the page is greater than the expection in migration logic. To mitigate the issue, callers of migrate_pages drains LRU pagevec via migrate_prep or lru_add_drain_all before migrate_pages call. However, it's not enough because pages coming into pagevec after the draining call still could stay at the pagevec so it could keep preventing page migration. Since some callers of migrate_pages have retrial logic with LRU draining, the page would migrate at next trail but it is still fragile in that it doesn't close the fundamental race between upcoming LRU pages into pagvec and migration so the migration failure could cause contiguous memory allocation failure in the end. To close the race, this patch disables lru caches(i.e, pagevec) during ongoing migration until migrate is done. Since it's really hard to reproduce, I measured how many times migrate_pages retried with force mode(it is about a fallback to a sync migration) with below debug code. int migrate_pages(struct list_head *from, new_page_t get_new_page, .. .. if (rc && reason == MR_CONTIG_RANGE && pass > 2) { printk(KERN_ERR, "pfn 0x%lx reason %d", page_to_pfn(page), rc); dump_page(page, "fail to migrate"); } The test was repeating android apps launching with cma allocation in background every five seconds. Total cma allocation count was about 500 during the testing. With this patch, the dump_page count was reduced from 400 to 30. The new interface is also useful for memory hotplug which currently drains lru pcp caches after each migration failure. This is rather suboptimal as it has to disrupt others running during the operation. With the new interface the operation happens only once. This is also in line with pcp allocator cache which are disabled for the offlining as well. Link: https://lkml.kernel.org/r/20210319175127.886124-1-minchan@kernel.org Signed-off-by: Minchan Kim <minchan@kernel.org> Reviewed-by: Chris Goldsworthy <cgoldswo@codeaurora.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: John Dias <joaodias@google.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: David Hildenbrand <david@redhat.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Oliver Sang <oliver.sang@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-05-05 01:36:54 +00:00
lru_cache_enable();
if (err < 0)
return err;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}
Migration: find correct vma in new_vma_page() We hit the BUG_ON() in mm/rmap.c:vma_address() when trying to migrate via mbind(MPOL_MF_MOVE) a non-anon region that spans multiple vmas. For anon-regions, we just fail to migrate any pages beyond the 1st vma in the range. This occurs because do_mbind() collects a list of pages to migrate by calling check_range(). check_range() walks the task's mm, spanning vmas as necessary, to collect the migratable pages into a list. Then, do_mbind() calls migrate_pages() passing the list of pages, a function to allocate new pages based on vma policy [new_vma_page()], and a pointer to the first vma of the range. For each page in the list, new_vma_page() calls page_address_in_vma() passing the page and the vma [first in range] to obtain the address to get for alloc_page_vma(). The page address is needed to get interleaving policy correct. If the pages in the list come from multiple vmas, eventually, new_page_address() will pass that page to page_address_in_vma() with the incorrect vma. For !PageAnon pages, this will result in a bug check in rmap.c:vma_address(). For anon pages, vma_address() will just return EFAULT and fail the migration. This patch modifies new_vma_page() to check the return value from page_address_in_vma(). If the return value is EFAULT, new_vma_page() searchs forward via vm_next for the vma that maps the page--i.e., that does not return EFAULT. This assumes that the pages in the list handed to migrate_pages() is in address order. This is currently case. The patch documents this assumption in a new comment block for new_vma_page(). If new_vma_page() cannot locate the vma mapping the page in a forward search in the mm, it will pass a NULL vma to alloc_page_vma(). This will result in the allocation using the task policy, if any, else system default policy. This situation is unlikely, but the patch documents this behavior with a comment. Note, this patch results in restarting from the first vma in a multi-vma range each time new_vma_page() is called. If this is not acceptable, we can make the vma argument a pointer, both in new_vma_page() and it's caller unmap_and_move() so that the value held by the loop in migrate_pages() always passes down the last vma in which a page was found. This will require changes to all new_page_t functions passed to migrate_pages(). Is this necessary? For this patch to work, we can't bug check in vma_address() for pages outside the argument vma. This patch removes the BUG_ON(). All other callers [besides new_vma_page()] already check the return status. Tested on x86_64, 4 node NUMA platform. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-11-15 00:59:10 +00:00
/*
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
* Allocate a new folio for page migration, according to NUMA mempolicy.
Migration: find correct vma in new_vma_page() We hit the BUG_ON() in mm/rmap.c:vma_address() when trying to migrate via mbind(MPOL_MF_MOVE) a non-anon region that spans multiple vmas. For anon-regions, we just fail to migrate any pages beyond the 1st vma in the range. This occurs because do_mbind() collects a list of pages to migrate by calling check_range(). check_range() walks the task's mm, spanning vmas as necessary, to collect the migratable pages into a list. Then, do_mbind() calls migrate_pages() passing the list of pages, a function to allocate new pages based on vma policy [new_vma_page()], and a pointer to the first vma of the range. For each page in the list, new_vma_page() calls page_address_in_vma() passing the page and the vma [first in range] to obtain the address to get for alloc_page_vma(). The page address is needed to get interleaving policy correct. If the pages in the list come from multiple vmas, eventually, new_page_address() will pass that page to page_address_in_vma() with the incorrect vma. For !PageAnon pages, this will result in a bug check in rmap.c:vma_address(). For anon pages, vma_address() will just return EFAULT and fail the migration. This patch modifies new_vma_page() to check the return value from page_address_in_vma(). If the return value is EFAULT, new_vma_page() searchs forward via vm_next for the vma that maps the page--i.e., that does not return EFAULT. This assumes that the pages in the list handed to migrate_pages() is in address order. This is currently case. The patch documents this assumption in a new comment block for new_vma_page(). If new_vma_page() cannot locate the vma mapping the page in a forward search in the mm, it will pass a NULL vma to alloc_page_vma(). This will result in the allocation using the task policy, if any, else system default policy. This situation is unlikely, but the patch documents this behavior with a comment. Note, this patch results in restarting from the first vma in a multi-vma range each time new_vma_page() is called. If this is not acceptable, we can make the vma argument a pointer, both in new_vma_page() and it's caller unmap_and_move() so that the value held by the loop in migrate_pages() always passes down the last vma in which a page was found. This will require changes to all new_page_t functions passed to migrate_pages(). Is this necessary? For this patch to work, we can't bug check in vma_address() for pages outside the argument vma. This patch removes the BUG_ON(). All other callers [besides new_vma_page()] already check the return status. Tested on x86_64, 4 node NUMA platform. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-11-15 00:59:10 +00:00
*/
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
unsigned long private)
{
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
struct migration_mpol *mmpol = (struct migration_mpol *)private;
struct mempolicy *pol = mmpol->pol;
pgoff_t ilx = mmpol->ilx;
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
struct page *page;
unsigned int order;
int nid = numa_node_id();
gfp_t gfp;
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
order = folio_order(src);
ilx += src->index >> order;
mm/mempolicy: fix !vma in new_vma_page() BUG_ON(!vma) assumption is introduced by commit 0bf598d863e3 ("mbind: add BUG_ON(!vma) in new_vma_page()"), however, even if address = __vma_address(page, vma); and vma->start < address < vma->end page_address_in_vma() may still return -EFAULT because of many other conditions in it. As a result the while loop in new_vma_page() may end with vma=NULL. This patch revert the commit and also fix the potential dereference NULL pointer reported by Dan. http://marc.info/?l=linux-mm&m=137689530323257&w=2 kernel BUG at mm/mempolicy.c:1204! invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC CPU: 3 PID: 7056 Comm: trinity-child3 Not tainted 3.13.0-rc3+ #2 task: ffff8801ca5295d0 ti: ffff88005ab20000 task.ti: ffff88005ab20000 RIP: new_vma_page+0x70/0x90 RSP: 0000:ffff88005ab21db0 EFLAGS: 00010246 RAX: fffffffffffffff2 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000008040075 RSI: ffff8801c3d74600 RDI: ffffea00079a8b80 RBP: ffff88005ab21dc8 R08: 0000000000000004 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: fffffffffffffff2 R13: ffffea00079a8b80 R14: 0000000000400000 R15: 0000000000400000 FS: 00007ff49c6f4740(0000) GS:ffff880244e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ff49c68f994 CR3: 000000005a205000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: ffffea00079a8b80 ffffea00079a8bc0 ffffea00079a8ba0 ffff88005ab21e50 ffffffff811adc7a 0000000000000000 ffff8801ca5295d0 0000000464e224f8 0000000000000000 0000000000000002 0000000000000000 ffff88020ce75c00 Call Trace: migrate_pages+0x12a/0x850 SYSC_mbind+0x513/0x6a0 SyS_mbind+0xe/0x10 ia32_do_call+0x13/0x13 Code: 85 c0 75 2f 4c 89 e1 48 89 da 31 f6 bf da 00 02 00 65 44 8b 04 25 08 f7 1c 00 e8 ec fd ff ff 5b 41 5c 41 5d 5d c3 0f 1f 44 00 00 <0f> 0b 66 0f 1f 44 00 00 4c 89 e6 48 89 df ba 01 00 00 00 e8 48 RIP [<ffffffff8119f200>] new_vma_page+0x70/0x90 RSP <ffff88005ab21db0> Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com> Reported-by: Dave Jones <davej@redhat.com> Reported-by: Sasha Levin <sasha.levin@oracle.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Reviewed-by: Bob Liu <bob.liu@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-12-19 01:08:56 +00:00
if (folio_test_hugetlb(src)) {
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
nodemask_t *nodemask;
struct hstate *h;
h = folio_hstate(src);
gfp = htlb_alloc_mask(h);
nodemask = policy_nodemask(gfp, pol, ilx, &nid);
return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
}
if (folio_test_large(src))
gfp = GFP_TRANSHUGE;
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
else
gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
return page_rmappable_folio(page);
}
#else
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
unsigned long flags)
{
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
return false;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
}
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
return -ENOSYS;
}
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
unsigned long private)
{
return NULL;
}
#endif
static long do_mbind(unsigned long start, unsigned long len,
mempolicy: support optional mode flags With the evolution of mempolicies, it is necessary to support mempolicy mode flags that specify how the policy shall behave in certain circumstances. The most immediate need for mode flag support is to suppress remapping the nodemask of a policy at the time of rebind. Both the mempolicy mode and flags are passed by the user in the 'int policy' formal of either the set_mempolicy() or mbind() syscall. A new constant, MPOL_MODE_FLAGS, represents the union of legal optional flags that may be passed as part of this int. Mempolicies that include illegal flags as part of their policy are rejected as invalid. An additional member to struct mempolicy is added to support the mode flags: struct mempolicy { ... unsigned short policy; unsigned short flags; } The splitting of the 'int' actual passed by the user is done in sys_set_mempolicy() and sys_mbind() for their respective syscalls. This is done by intersecting the actual with MPOL_MODE_FLAGS, rejecting the syscall of there are additional flags, and storing it in the new 'flags' member of struct mempolicy. The intersection of the actual with ~MPOL_MODE_FLAGS is stored in the 'policy' member of the struct and all current users of pol->policy remain unchanged. The union of the policy mode and optional mode flags is passed back to the user in get_mempolicy(). This combination of mode and flags within the same actual does not break userspace code that relies on get_mempolicy(&policy, ...) and either switch (policy) { case MPOL_BIND: ... case MPOL_INTERLEAVE: ... }; statements or if (policy == MPOL_INTERLEAVE) { ... } statements. Such applications would need to use optional mode flags when calling set_mempolicy() or mbind() for these previously implemented statements to stop working. If an application does start using optional mode flags, it will need to mask the optional flags off the policy in switch and conditional statements that only test mode. An additional member is also added to struct shmem_sb_info to store the optional mode flags. [hugh@veritas.com: shmem mpol: fix build warning] Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:25 +00:00
unsigned short mode, unsigned short mode_flags,
nodemask_t *nmask, unsigned long flags)
{
struct mm_struct *mm = current->mm;
mm/mempolicy: fix use-after-free of VMA iterator set_mempolicy_home_node() iterates over a list of VMAs and calls mbind_range() on each VMA, which also iterates over the singular list of the VMA passed in and potentially splits the VMA. Since the VMA iterator is not passed through, set_mempolicy_home_node() may now point to a stale node in the VMA tree. This can result in a UAF as reported by syzbot. Avoid the stale maple tree node by passing the VMA iterator through to the underlying call to split_vma(). mbind_range() is also overly complicated, since there are two calling functions and one already handles iterating over the VMAs. Simplify mbind_range() to only handle merging and splitting of the VMAs. Align the new loop in do_mbind() and existing loop in set_mempolicy_home_node() to use the reduced mbind_range() function. This allows for a single location of the range calculation and avoids constantly looking up the previous VMA (since this is a loop over the VMAs). Link: https://lore.kernel.org/linux-mm/000000000000c93feb05f87e24ad@google.com/ Fixes: 66850be55e8e ("mm/mempolicy: use vma iterator & maple state instead of vma linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20230410152205.2294819-1-Liam.Howlett@oracle.com Tested-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-10 15:22:05 +00:00
struct vm_area_struct *vma, *prev;
struct vma_iterator vmi;
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
struct migration_mpol mmpol;
struct mempolicy *new;
unsigned long end;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
long err;
long nr_failed;
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
if (start & ~PAGE_MASK)
return -EINVAL;
if (mode == MPOL_DEFAULT)
flags &= ~MPOL_MF_STRICT;
len = PAGE_ALIGN(len);
end = start + len;
if (end < start)
return -EINVAL;
if (end == start)
return 0;
mempolicy: support optional mode flags With the evolution of mempolicies, it is necessary to support mempolicy mode flags that specify how the policy shall behave in certain circumstances. The most immediate need for mode flag support is to suppress remapping the nodemask of a policy at the time of rebind. Both the mempolicy mode and flags are passed by the user in the 'int policy' formal of either the set_mempolicy() or mbind() syscall. A new constant, MPOL_MODE_FLAGS, represents the union of legal optional flags that may be passed as part of this int. Mempolicies that include illegal flags as part of their policy are rejected as invalid. An additional member to struct mempolicy is added to support the mode flags: struct mempolicy { ... unsigned short policy; unsigned short flags; } The splitting of the 'int' actual passed by the user is done in sys_set_mempolicy() and sys_mbind() for their respective syscalls. This is done by intersecting the actual with MPOL_MODE_FLAGS, rejecting the syscall of there are additional flags, and storing it in the new 'flags' member of struct mempolicy. The intersection of the actual with ~MPOL_MODE_FLAGS is stored in the 'policy' member of the struct and all current users of pol->policy remain unchanged. The union of the policy mode and optional mode flags is passed back to the user in get_mempolicy(). This combination of mode and flags within the same actual does not break userspace code that relies on get_mempolicy(&policy, ...) and either switch (policy) { case MPOL_BIND: ... case MPOL_INTERLEAVE: ... }; statements or if (policy == MPOL_INTERLEAVE) { ... } statements. Such applications would need to use optional mode flags when calling set_mempolicy() or mbind() for these previously implemented statements to stop working. If an application does start using optional mode flags, it will need to mask the optional flags off the policy in switch and conditional statements that only test mode. An additional member is also added to struct shmem_sb_info to store the optional mode flags. [hugh@veritas.com: shmem mpol: fix build warning] Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:25 +00:00
new = mpol_new(mode, mode_flags, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
*/
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
lru_cache_disable();
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
{
NODEMASK_SCRATCH(scratch);
if (scratch) {
mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse <walken@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com> Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Liam Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ying Han <yinghan@google.com> Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-09 04:33:25 +00:00
mmap_write_lock(mm);
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
err = mpol_set_nodemask(new, nmask, scratch);
if (err)
mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse <walken@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com> Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Liam Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ying Han <yinghan@google.com> Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-09 04:33:25 +00:00
mmap_write_unlock(mm);
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
} else
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
}
if (err)
goto mpol_out;
/*
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
* Lock the VMAs before scanning for pages to migrate,
* to ensure we don't miss a concurrently inserted page.
*/
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
nr_failed = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified When both MPOL_MF_MOVE* and MPOL_MF_STRICT was specified, mbind() should try best to migrate misplaced pages, if some of the pages could not be migrated, then return -EIO. There are three different sub-cases: 1. vma is not migratable 2. vma is migratable, but there are unmovable pages 3. vma is migratable, pages are movable, but migrate_pages() fails If #1 happens, kernel would just abort immediately, then return -EIO, after a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified"). If #3 happens, kernel would set policy and migrate pages with best-effort, but won't rollback the migrated pages and reset the policy back. Before that commit, they behaves in the same way. It'd better to keep their behavior consistent. But, rolling back the migrated pages and resetting the policy back sounds not feasible, so just make #1 behave as same as #3. Userspace will know that not everything was successfully migrated (via -EIO), and can take whatever steps it deems necessary - attempt rollback, determine which exact page(s) are violating the policy, etc. Make queue_pages_range() return 1 to indicate there are unmovable pages or vma is not migratable. The #2 is not handled correctly in the current kernel, the following patch will fix it. [yang.shi@linux.alibaba.com: fix review comments from Vlastimil] Link: http://lkml.kernel.org/r/1563556862-54056-2-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1561162809-59140-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-13 22:37:15 +00:00
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (nr_failed < 0) {
err = nr_failed;
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
nr_failed = 0;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
} else {
vma_iter_init(&vmi, mm, start);
prev = vma_prev(&vmi);
for_each_vma_range(vmi, vma, end) {
err = mbind_range(&vmi, vma, &prev, start, end, new);
if (err)
break;
}
mm/mempolicy: fix use-after-free of VMA iterator set_mempolicy_home_node() iterates over a list of VMAs and calls mbind_range() on each VMA, which also iterates over the singular list of the VMA passed in and potentially splits the VMA. Since the VMA iterator is not passed through, set_mempolicy_home_node() may now point to a stale node in the VMA tree. This can result in a UAF as reported by syzbot. Avoid the stale maple tree node by passing the VMA iterator through to the underlying call to split_vma(). mbind_range() is also overly complicated, since there are two calling functions and one already handles iterating over the VMAs. Simplify mbind_range() to only handle merging and splitting of the VMAs. Align the new loop in do_mbind() and existing loop in set_mempolicy_home_node() to use the reduced mbind_range() function. This allows for a single location of the range calculation and avoids constantly looking up the previous VMA (since this is a loop over the VMAs). Link: https://lore.kernel.org/linux-mm/000000000000c93feb05f87e24ad@google.com/ Fixes: 66850be55e8e ("mm/mempolicy: use vma iterator & maple state instead of vma linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20230410152205.2294819-1-Liam.Howlett@oracle.com Tested-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-10 15:22:05 +00:00
}
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
if (!err && !list_empty(&pagelist)) {
/* Convert MPOL_DEFAULT's NULL to task or default policy */
if (!new) {
new = get_task_policy(current);
mpol_get(new);
}
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
mmpol.pol = new;
mmpol.ilx = 0;
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
/*
* In the interleaved case, attempt to allocate on exactly the
* targeted nodes, for the first VMA to be migrated; for later
* VMAs, the nodes will still be interleaved from the targeted
* nodemask, but one by one may be selected differently.
*/
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
if (new->mode == MPOL_INTERLEAVE ||
new->mode == MPOL_WEIGHTED_INTERLEAVE) {
struct folio *folio;
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
unsigned int order;
unsigned long addr = -EFAULT;
list_for_each_entry(folio, &pagelist, lru) {
if (!folio_test_ksm(folio))
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
break;
}
if (!list_entry_is_head(folio, &pagelist, lru)) {
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
vma_iter_init(&vmi, mm, start);
for_each_vma_range(vmi, vma, end) {
addr = page_address_in_vma(
folio_page(folio, 0), vma);
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
if (addr != -EFAULT)
break;
}
}
if (addr != -EFAULT) {
order = folio_order(folio);
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
/* We already know the pol, but not the ilx */
mpol_cond_put(get_vma_policy(vma, addr, order,
&mmpol.ilx));
/* Set base from which to increment by index */
mmpol.ilx -= folio->index >> order;
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
}
}
mm: mempolicy: fix the wrong return value and potential pages leak of mbind Commit d883544515aa ("mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified") fixed the return value of mbind() for a couple of corner cases. But, it altered the errno for some other cases, for example, mbind() should return -EFAULT when part or all of the memory range specified by nodemask and maxnode points outside your accessible address space, or there was an unmapped hole in the specified memory range specified by addr and len. Fix this by preserving the errno returned by queue_pages_range(). And, the pagelist may be not empty even though queue_pages_range() returns error, put the pages back to LRU since mbind_range() is not called to really apply the policy so those pages should not be migrated, this is also the old behavior before the problematic commit. Link: http://lkml.kernel.org/r/1572454731-3925-1-git-send-email-yang.shi@linux.alibaba.com Fixes: d883544515aa ("mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified") Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Reported-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Li Xinhai <lixinhai.lxh@gmail.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: <stable@vger.kernel.org> [4.19 and 5.2+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-11-16 01:34:33 +00:00
}
mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse <walken@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com> Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Liam Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ying Han <yinghan@google.com> Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-09 04:33:25 +00:00
mmap_write_unlock(mm);
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
if (!err && !list_empty(&pagelist)) {
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
nr_failed |= migrate_pages(&pagelist,
alloc_migration_target_by_mpol, NULL,
mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:29:00 +00:00
(unsigned long)&mmpol, MIGRATE_SYNC,
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
MR_MEMPOLICY_MBIND, NULL);
mm: mempolicy: fix the wrong return value and potential pages leak of mbind Commit d883544515aa ("mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified") fixed the return value of mbind() for a couple of corner cases. But, it altered the errno for some other cases, for example, mbind() should return -EFAULT when part or all of the memory range specified by nodemask and maxnode points outside your accessible address space, or there was an unmapped hole in the specified memory range specified by addr and len. Fix this by preserving the errno returned by queue_pages_range(). And, the pagelist may be not empty even though queue_pages_range() returns error, put the pages back to LRU since mbind_range() is not called to really apply the policy so those pages should not be migrated, this is also the old behavior before the problematic commit. Link: http://lkml.kernel.org/r/1572454731-3925-1-git-send-email-yang.shi@linux.alibaba.com Fixes: d883544515aa ("mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified") Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Reported-by: Li Xinhai <lixinhai.lxh@gmail.com> Reviewed-by: Li Xinhai <lixinhai.lxh@gmail.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: <stable@vger.kernel.org> [4.19 and 5.2+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-11-16 01:34:33 +00:00
}
mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:27:47 +00:00
if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-03 09:17:43 +00:00
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified When both MPOL_MF_MOVE* and MPOL_MF_STRICT was specified, mbind() should try best to migrate misplaced pages, if some of the pages could not be migrated, then return -EIO. There are three different sub-cases: 1. vma is not migratable 2. vma is migratable, but there are unmovable pages 3. vma is migratable, pages are movable, but migrate_pages() fails If #1 happens, kernel would just abort immediately, then return -EIO, after a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified"). If #3 happens, kernel would set policy and migrate pages with best-effort, but won't rollback the migrated pages and reset the policy back. Before that commit, they behaves in the same way. It'd better to keep their behavior consistent. But, rolling back the migrated pages and resetting the policy back sounds not feasible, so just make #1 behave as same as #3. Userspace will know that not everything was successfully migrated (via -EIO), and can take whatever steps it deems necessary - attempt rollback, determine which exact page(s) are violating the policy, etc. Make queue_pages_range() return 1 to indicate there are unmovable pages or vma is not migratable. The #2 is not handled correctly in the current kernel, the following patch will fix it. [yang.shi@linux.alibaba.com: fix review comments from Vlastimil] Link: http://lkml.kernel.org/r/1563556862-54056-2-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1561162809-59140-2-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-08-13 22:37:15 +00:00
mpol_out:
mpol_put(new);
mm: disable LRU pagevec during the migration temporarily LRU pagevec holds refcount of pages until the pagevec are drained. It could prevent migration since the refcount of the page is greater than the expection in migration logic. To mitigate the issue, callers of migrate_pages drains LRU pagevec via migrate_prep or lru_add_drain_all before migrate_pages call. However, it's not enough because pages coming into pagevec after the draining call still could stay at the pagevec so it could keep preventing page migration. Since some callers of migrate_pages have retrial logic with LRU draining, the page would migrate at next trail but it is still fragile in that it doesn't close the fundamental race between upcoming LRU pages into pagvec and migration so the migration failure could cause contiguous memory allocation failure in the end. To close the race, this patch disables lru caches(i.e, pagevec) during ongoing migration until migrate is done. Since it's really hard to reproduce, I measured how many times migrate_pages retried with force mode(it is about a fallback to a sync migration) with below debug code. int migrate_pages(struct list_head *from, new_page_t get_new_page, .. .. if (rc && reason == MR_CONTIG_RANGE && pass > 2) { printk(KERN_ERR, "pfn 0x%lx reason %d", page_to_pfn(page), rc); dump_page(page, "fail to migrate"); } The test was repeating android apps launching with cma allocation in background every five seconds. Total cma allocation count was about 500 during the testing. With this patch, the dump_page count was reduced from 400 to 30. The new interface is also useful for memory hotplug which currently drains lru pcp caches after each migration failure. This is rather suboptimal as it has to disrupt others running during the operation. With the new interface the operation happens only once. This is also in line with pcp allocator cache which are disabled for the offlining as well. Link: https://lkml.kernel.org/r/20210319175127.886124-1-minchan@kernel.org Signed-off-by: Minchan Kim <minchan@kernel.org> Reviewed-by: Chris Goldsworthy <cgoldswo@codeaurora.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: John Dias <joaodias@google.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: David Hildenbrand <david@redhat.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Oliver Sang <oliver.sang@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-05-05 01:36:54 +00:00
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
lru_cache_enable();
return err;
}
/*
* User space interface with variable sized bitmaps for nodelists.
*/
mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann <arnd@arndb.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Biederman <ebiederm@xmission.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Helge Deller <deller@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Paul Mackerras <paulus@samba.org> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-08 22:18:21 +00:00
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long nlongs = BITS_TO_LONGS(maxnode);
int ret;
if (in_compat_syscall())
ret = compat_get_bitmap(mask,
(const compat_ulong_t __user *)nmask,
maxnode);
else
ret = copy_from_user(mask, nmask,
nlongs * sizeof(unsigned long));
if (ret)
return -EFAULT;
if (maxnode % BITS_PER_LONG)
mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
return 0;
}
/* Copy a node mask from user space. */
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
--maxnode;
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
return -EINVAL;
mm/mempolicy: fix the check of nodemask from user As Xiaojun reported the ltp of migrate_pages01 will fail on arm64 system which has 4 nodes[0...3], all have memory and CONFIG_NODES_SHIFT=2: migrate_pages01 0 TINFO : test_invalid_nodes migrate_pages01 14 TFAIL : migrate_pages_common.c:45: unexpected failure - returned value = 0, expected: -1 migrate_pages01 15 TFAIL : migrate_pages_common.c:55: call succeeded unexpectedly In this case the test_invalid_nodes of migrate_pages01 will call: SYSC_migrate_pages as: migrate_pages(0, , {0x0000000000000001}, 64, , {0x0000000000000010}, 64) = 0 The new nodes specifies one or more node IDs that are greater than the maximum supported node ID, however, the errno is not set to EINVAL as expected. As man pages of set_mempolicy[1], mbind[2], and migrate_pages[3] mentioned, when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, the errno should set to EINVAL. However, get_nodes only check whether the part of bits [BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES), maxnode) is zero or not, and remain [MAX_NUMNODES, BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES) unchecked. This patch is to check the bits of [MAX_NUMNODES, maxnode) in get_nodes to let migrate_pages set the errno to EINVAL when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, which follows the manpage's guide. [1] http://man7.org/linux/man-pages/man2/set_mempolicy.2.html [2] http://man7.org/linux/man-pages/man2/mbind.2.html [3] http://man7.org/linux/man-pages/man2/migrate_pages.2.html Link: http://lkml.kernel.org/r/1510882624-44342-3-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com> Reported-by: Tan Xiaojun <tanxiaojun@huawei.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Chris Salls <salls@cs.ucsb.edu> Cc: Christopher Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-01 00:16:11 +00:00
/*
* When the user specified more nodes than supported just check
mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann <arnd@arndb.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Biederman <ebiederm@xmission.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Helge Deller <deller@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Paul Mackerras <paulus@samba.org> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-08 22:18:21 +00:00
* if the non supported part is all zero, one word at a time,
* starting at the end.
mm/mempolicy: fix the check of nodemask from user As Xiaojun reported the ltp of migrate_pages01 will fail on arm64 system which has 4 nodes[0...3], all have memory and CONFIG_NODES_SHIFT=2: migrate_pages01 0 TINFO : test_invalid_nodes migrate_pages01 14 TFAIL : migrate_pages_common.c:45: unexpected failure - returned value = 0, expected: -1 migrate_pages01 15 TFAIL : migrate_pages_common.c:55: call succeeded unexpectedly In this case the test_invalid_nodes of migrate_pages01 will call: SYSC_migrate_pages as: migrate_pages(0, , {0x0000000000000001}, 64, , {0x0000000000000010}, 64) = 0 The new nodes specifies one or more node IDs that are greater than the maximum supported node ID, however, the errno is not set to EINVAL as expected. As man pages of set_mempolicy[1], mbind[2], and migrate_pages[3] mentioned, when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, the errno should set to EINVAL. However, get_nodes only check whether the part of bits [BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES), maxnode) is zero or not, and remain [MAX_NUMNODES, BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES) unchecked. This patch is to check the bits of [MAX_NUMNODES, maxnode) in get_nodes to let migrate_pages set the errno to EINVAL when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, which follows the manpage's guide. [1] http://man7.org/linux/man-pages/man2/set_mempolicy.2.html [2] http://man7.org/linux/man-pages/man2/mbind.2.html [3] http://man7.org/linux/man-pages/man2/migrate_pages.2.html Link: http://lkml.kernel.org/r/1510882624-44342-3-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com> Reported-by: Tan Xiaojun <tanxiaojun@huawei.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Chris Salls <salls@cs.ucsb.edu> Cc: Christopher Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-01 00:16:11 +00:00
*/
mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann <arnd@arndb.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Biederman <ebiederm@xmission.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Helge Deller <deller@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Paul Mackerras <paulus@samba.org> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-08 22:18:21 +00:00
while (maxnode > MAX_NUMNODES) {
unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
unsigned long t;
if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
mm/mempolicy: fix the check of nodemask from user As Xiaojun reported the ltp of migrate_pages01 will fail on arm64 system which has 4 nodes[0...3], all have memory and CONFIG_NODES_SHIFT=2: migrate_pages01 0 TINFO : test_invalid_nodes migrate_pages01 14 TFAIL : migrate_pages_common.c:45: unexpected failure - returned value = 0, expected: -1 migrate_pages01 15 TFAIL : migrate_pages_common.c:55: call succeeded unexpectedly In this case the test_invalid_nodes of migrate_pages01 will call: SYSC_migrate_pages as: migrate_pages(0, , {0x0000000000000001}, 64, , {0x0000000000000010}, 64) = 0 The new nodes specifies one or more node IDs that are greater than the maximum supported node ID, however, the errno is not set to EINVAL as expected. As man pages of set_mempolicy[1], mbind[2], and migrate_pages[3] mentioned, when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, the errno should set to EINVAL. However, get_nodes only check whether the part of bits [BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES), maxnode) is zero or not, and remain [MAX_NUMNODES, BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES) unchecked. This patch is to check the bits of [MAX_NUMNODES, maxnode) in get_nodes to let migrate_pages set the errno to EINVAL when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, which follows the manpage's guide. [1] http://man7.org/linux/man-pages/man2/set_mempolicy.2.html [2] http://man7.org/linux/man-pages/man2/mbind.2.html [3] http://man7.org/linux/man-pages/man2/migrate_pages.2.html Link: http://lkml.kernel.org/r/1510882624-44342-3-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com> Reported-by: Tan Xiaojun <tanxiaojun@huawei.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Chris Salls <salls@cs.ucsb.edu> Cc: Christopher Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-01 00:16:11 +00:00
return -EFAULT;
mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann <arnd@arndb.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Biederman <ebiederm@xmission.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Helge Deller <deller@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Paul Mackerras <paulus@samba.org> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-08 22:18:21 +00:00
if (maxnode - bits >= MAX_NUMNODES) {
maxnode -= bits;
} else {
maxnode = MAX_NUMNODES;
t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
}
if (t)
mm/mempolicy: fix the check of nodemask from user As Xiaojun reported the ltp of migrate_pages01 will fail on arm64 system which has 4 nodes[0...3], all have memory and CONFIG_NODES_SHIFT=2: migrate_pages01 0 TINFO : test_invalid_nodes migrate_pages01 14 TFAIL : migrate_pages_common.c:45: unexpected failure - returned value = 0, expected: -1 migrate_pages01 15 TFAIL : migrate_pages_common.c:55: call succeeded unexpectedly In this case the test_invalid_nodes of migrate_pages01 will call: SYSC_migrate_pages as: migrate_pages(0, , {0x0000000000000001}, 64, , {0x0000000000000010}, 64) = 0 The new nodes specifies one or more node IDs that are greater than the maximum supported node ID, however, the errno is not set to EINVAL as expected. As man pages of set_mempolicy[1], mbind[2], and migrate_pages[3] mentioned, when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, the errno should set to EINVAL. However, get_nodes only check whether the part of bits [BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES), maxnode) is zero or not, and remain [MAX_NUMNODES, BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES) unchecked. This patch is to check the bits of [MAX_NUMNODES, maxnode) in get_nodes to let migrate_pages set the errno to EINVAL when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, which follows the manpage's guide. [1] http://man7.org/linux/man-pages/man2/set_mempolicy.2.html [2] http://man7.org/linux/man-pages/man2/mbind.2.html [3] http://man7.org/linux/man-pages/man2/migrate_pages.2.html Link: http://lkml.kernel.org/r/1510882624-44342-3-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com> Reported-by: Tan Xiaojun <tanxiaojun@huawei.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Chris Salls <salls@cs.ucsb.edu> Cc: Christopher Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-02-01 00:16:11 +00:00
return -EINVAL;
}
mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann <arnd@arndb.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Biederman <ebiederm@xmission.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Helge Deller <deller@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Paul Mackerras <paulus@samba.org> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-08 22:18:21 +00:00
return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
numa: change get_mempolicy() to use nr_node_ids instead of MAX_NUMNODES The system call, get_mempolicy() [1], passes an unsigned long *nodemask pointer and an unsigned long maxnode argument which specifies the length of the user's nodemask array in bits (which is rounded up). The manual page says that if the maxnode value is too small, get_mempolicy will return EINVAL but there is no system call to return this minimum value. To determine this value, some programs search /proc/<pid>/status for a line starting with "Mems_allowed:" and use the number of digits in the mask to determine the minimum value. A recent change to the way this line is formatted [2] causes these programs to compute a value less than MAX_NUMNODES so get_mempolicy() returns EINVAL. Change get_mempolicy(), the older compat version of get_mempolicy(), and the copy_nodes_to_user() function to use nr_node_ids instead of MAX_NUMNODES, thus preserving the defacto method of computing the minimum size for the nodemask array and the maxnode argument. [1] http://man7.org/linux/man-pages/man2/get_mempolicy.2.html [2] https://lore.kernel.org/lkml/1545405631-6808-1-git-send-email-longman@redhat.com Link: http://lkml.kernel.org/r/20190211180245.22295-1-rcampbell@nvidia.com Fixes: 4fb8e5b89bcbbbb ("include/linux/nodemask.h: use nr_node_ids (not MAX_NUMNODES) in __nodemask_pr_numnodes()") Signed-off-by: Ralph Campbell <rcampbell@nvidia.com> Suggested-by: Alexander Duyck <alexander.duyck@gmail.com> Cc: Waiman Long <longman@redhat.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-02-21 06:18:58 +00:00
unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann <arnd@arndb.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Biederman <ebiederm@xmission.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Helge Deller <deller@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Paul Mackerras <paulus@samba.org> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-08 22:18:21 +00:00
bool compat = in_compat_syscall();
if (compat)
nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
return -EINVAL;
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
return -EFAULT;
copy = nbytes;
mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann <arnd@arndb.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Biederman <ebiederm@xmission.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Helge Deller <deller@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Paul Mackerras <paulus@samba.org> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-08 22:18:21 +00:00
maxnode = nr_node_ids;
}
mm: simplify compat numa syscalls The compat implementations for mbind, get_mempolicy, set_mempolicy and migrate_pages are just there to handle the subtly different layout of bitmaps on 32-bit hosts. The compat implementation however lacks some of the checks that are present in the native one, in particular for checking that the extra bits are all zero when user space has a larger mask size than the kernel. Worse, those extra bits do not get cleared when copying in or out of the kernel, which can lead to incorrect data as well. Unify the implementation to handle the compat bitmap layout directly in the get_nodes() and copy_nodes_to_user() helpers. Splitting out the get_bitmap() helper from get_nodes() also helps readability of the native case. On x86, two additional problems are addressed by this: compat tasks can pass a bitmap at the end of a mapping, causing a fault when reading across the page boundary for a 64-bit word. x32 tasks might also run into problems with get_mempolicy corrupting data when an odd number of 32-bit words gets passed. On parisc the migrate_pages() system call apparently had the wrong calling convention, as big-endian architectures expect the words inside of a bitmap to be swapped. This is not a problem though since parisc has no NUMA support. [arnd@arndb.de: fix mempolicy crash] Link: https://lkml.kernel.org/r/20210730143417.3700653-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/YQPLG20V3dmOfq3a@osiris/ Link: https://lkml.kernel.org/r/20210727144859.4150043-5-arnd@kernel.org Signed-off-by: Arnd Bergmann <arnd@arndb.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Borislav Petkov <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Biederman <ebiederm@xmission.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Helge Deller <deller@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Paul Mackerras <paulus@samba.org> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-08 22:18:21 +00:00
if (compat)
return compat_put_bitmap((compat_ulong_t __user *)mask,
nodes_addr(*nodes), maxnode);
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
*flags = *mode & MPOL_MODE_FLAGS;
*mode &= ~MPOL_MODE_FLAGS;
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
mm/mempolicy: advertise new MPOL_PREFERRED_MANY Adds a new mode to the existing mempolicy modes, MPOL_PREFERRED_MANY. MPOL_PREFERRED_MANY will be adequately documented in the internal admin-guide with this patch. Eventually, the man pages for mbind(2), get_mempolicy(2), set_mempolicy(2) and numactl(8) will also have text about this mode. Those shall contain the canonical reference. NUMA systems continue to become more prevalent. New technologies like PMEM make finer grain control over memory access patterns increasingly desirable. MPOL_PREFERRED_MANY allows userspace to specify a set of nodes that will be tried first when performing allocations. If those allocations fail, all remaining nodes will be tried. It's a straight forward API which solves many of the presumptive needs of system administrators wanting to optimize workloads on such machines. The mode will work either per VMA, or per thread. [Michal Hocko: refine kernel doc for MPOL_PREFERRED_MANY] Link: https://lore.kernel.org/r/20200630212517.308045-13-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-5-git-send-email-feng.tang@intel.com Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:16 +00:00
if ((unsigned int)(*mode) >= MPOL_MAX)
return -EINVAL;
if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
mm/mempolicy: do not allow illegal MPOL_F_NUMA_BALANCING | MPOL_LOCAL in mbind() syzbot reported access to unitialized memory in mbind() [1] Issue came with commit bda420b98505 ("numa balancing: migrate on fault among multiple bound nodes") This commit added a new bit in MPOL_MODE_FLAGS, but only checked valid combination (MPOL_F_NUMA_BALANCING can only be used with MPOL_BIND) in do_set_mempolicy() This patch moves the check in sanitize_mpol_flags() so that it is also used by mbind() [1] BUG: KMSAN: uninit-value in __mpol_equal+0x567/0x590 mm/mempolicy.c:2260 __mpol_equal+0x567/0x590 mm/mempolicy.c:2260 mpol_equal include/linux/mempolicy.h:105 [inline] vma_merge+0x4a1/0x1e60 mm/mmap.c:1190 mbind_range+0xcc8/0x1e80 mm/mempolicy.c:811 do_mbind+0xf42/0x15f0 mm/mempolicy.c:1333 kernel_mbind mm/mempolicy.c:1483 [inline] __do_sys_mbind mm/mempolicy.c:1490 [inline] __se_sys_mbind+0x437/0xb80 mm/mempolicy.c:1486 __x64_sys_mbind+0x19d/0x200 mm/mempolicy.c:1486 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was created at: slab_alloc_node mm/slub.c:3221 [inline] slab_alloc mm/slub.c:3230 [inline] kmem_cache_alloc+0x751/0xff0 mm/slub.c:3235 mpol_new mm/mempolicy.c:293 [inline] do_mbind+0x912/0x15f0 mm/mempolicy.c:1289 kernel_mbind mm/mempolicy.c:1483 [inline] __do_sys_mbind mm/mempolicy.c:1490 [inline] __se_sys_mbind+0x437/0xb80 mm/mempolicy.c:1486 __x64_sys_mbind+0x19d/0x200 mm/mempolicy.c:1486 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae ===================================================== Kernel panic - not syncing: panic_on_kmsan set ... CPU: 0 PID: 15049 Comm: syz-executor.0 Tainted: G B 5.15.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1ff/0x28e lib/dump_stack.c:106 dump_stack+0x25/0x28 lib/dump_stack.c:113 panic+0x44f/0xdeb kernel/panic.c:232 kmsan_report+0x2ee/0x300 mm/kmsan/report.c:186 __msan_warning+0xd7/0x150 mm/kmsan/instrumentation.c:208 __mpol_equal+0x567/0x590 mm/mempolicy.c:2260 mpol_equal include/linux/mempolicy.h:105 [inline] vma_merge+0x4a1/0x1e60 mm/mmap.c:1190 mbind_range+0xcc8/0x1e80 mm/mempolicy.c:811 do_mbind+0xf42/0x15f0 mm/mempolicy.c:1333 kernel_mbind mm/mempolicy.c:1483 [inline] __do_sys_mbind mm/mempolicy.c:1490 [inline] __se_sys_mbind+0x437/0xb80 mm/mempolicy.c:1486 __x64_sys_mbind+0x19d/0x200 mm/mempolicy.c:1486 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae Link: https://lkml.kernel.org/r/20211001215630.810592-1-eric.dumazet@gmail.com Fixes: bda420b98505 ("numa balancing: migrate on fault among multiple bound nodes") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Acked-by: Mel Gorman <mgorman@suse.de> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-10-18 22:15:49 +00:00
if (*flags & MPOL_F_NUMA_BALANCING) {
if (*mode != MPOL_BIND)
return -EINVAL;
*flags |= (MPOL_F_MOF | MPOL_F_MORON);
}
return 0;
}
static long kernel_mbind(unsigned long start, unsigned long len,
unsigned long mode, const unsigned long __user *nmask,
unsigned long maxnode, unsigned int flags)
{
unsigned short mode_flags;
nodemask_t nodes;
int lmode = mode;
int err;
start = untagged_addr(start);
err = sanitize_mpol_flags(&lmode, &mode_flags);
if (err)
return err;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: <linux-api@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-01-14 22:08:17 +00:00
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
unsigned long, home_node, unsigned long, flags)
{
struct mm_struct *mm = current->mm;
mm/mempolicy: fix use-after-free of VMA iterator set_mempolicy_home_node() iterates over a list of VMAs and calls mbind_range() on each VMA, which also iterates over the singular list of the VMA passed in and potentially splits the VMA. Since the VMA iterator is not passed through, set_mempolicy_home_node() may now point to a stale node in the VMA tree. This can result in a UAF as reported by syzbot. Avoid the stale maple tree node by passing the VMA iterator through to the underlying call to split_vma(). mbind_range() is also overly complicated, since there are two calling functions and one already handles iterating over the VMAs. Simplify mbind_range() to only handle merging and splitting of the VMAs. Align the new loop in do_mbind() and existing loop in set_mempolicy_home_node() to use the reduced mbind_range() function. This allows for a single location of the range calculation and avoids constantly looking up the previous VMA (since this is a loop over the VMAs). Link: https://lore.kernel.org/linux-mm/000000000000c93feb05f87e24ad@google.com/ Fixes: 66850be55e8e ("mm/mempolicy: use vma iterator & maple state instead of vma linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20230410152205.2294819-1-Liam.Howlett@oracle.com Tested-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-10 15:22:05 +00:00
struct vm_area_struct *vma, *prev;
struct mempolicy *new, *old;
mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: <linux-api@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-01-14 22:08:17 +00:00
unsigned long end;
int err = -ENOENT;
VMA_ITERATOR(vmi, mm, start);
mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: <linux-api@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-01-14 22:08:17 +00:00
start = untagged_addr(start);
if (start & ~PAGE_MASK)
return -EINVAL;
/*
* flags is used for future extension if any.
*/
if (flags != 0)
return -EINVAL;
/*
* Check home_node is online to avoid accessing uninitialized
* NODE_DATA.
*/
if (home_node >= MAX_NUMNODES || !node_online(home_node))
return -EINVAL;
len = PAGE_ALIGN(len);
mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: <linux-api@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-01-14 22:08:17 +00:00
end = start + len;
if (end < start)
return -EINVAL;
if (end == start)
return 0;
mmap_write_lock(mm);
mm/mempolicy: fix use-after-free of VMA iterator set_mempolicy_home_node() iterates over a list of VMAs and calls mbind_range() on each VMA, which also iterates over the singular list of the VMA passed in and potentially splits the VMA. Since the VMA iterator is not passed through, set_mempolicy_home_node() may now point to a stale node in the VMA tree. This can result in a UAF as reported by syzbot. Avoid the stale maple tree node by passing the VMA iterator through to the underlying call to split_vma(). mbind_range() is also overly complicated, since there are two calling functions and one already handles iterating over the VMAs. Simplify mbind_range() to only handle merging and splitting of the VMAs. Align the new loop in do_mbind() and existing loop in set_mempolicy_home_node() to use the reduced mbind_range() function. This allows for a single location of the range calculation and avoids constantly looking up the previous VMA (since this is a loop over the VMAs). Link: https://lore.kernel.org/linux-mm/000000000000c93feb05f87e24ad@google.com/ Fixes: 66850be55e8e ("mm/mempolicy: use vma iterator & maple state instead of vma linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20230410152205.2294819-1-Liam.Howlett@oracle.com Tested-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-10 15:22:05 +00:00
prev = vma_prev(&vmi);
for_each_vma_range(vmi, vma, end) {
mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: <linux-api@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-01-14 22:08:17 +00:00
/*
* If any vma in the range got policy other than MPOL_BIND
* or MPOL_PREFERRED_MANY we return error. We don't reset
* the home node for vmas we already updated before.
*/
old = vma_policy(vma);
if (!old) {
prev = vma;
continue;
}
if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: <linux-api@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-01-14 22:08:17 +00:00
err = -EOPNOTSUPP;
break;
}
new = mpol_dup(old);
if (IS_ERR(new)) {
err = PTR_ERR(new);
break;
}
mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: <linux-api@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-01-14 22:08:17 +00:00
vma_start_write(vma);
mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: <linux-api@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-01-14 22:08:17 +00:00
new->home_node = home_node;
mm/mempolicy: fix use-after-free of VMA iterator set_mempolicy_home_node() iterates over a list of VMAs and calls mbind_range() on each VMA, which also iterates over the singular list of the VMA passed in and potentially splits the VMA. Since the VMA iterator is not passed through, set_mempolicy_home_node() may now point to a stale node in the VMA tree. This can result in a UAF as reported by syzbot. Avoid the stale maple tree node by passing the VMA iterator through to the underlying call to split_vma(). mbind_range() is also overly complicated, since there are two calling functions and one already handles iterating over the VMAs. Simplify mbind_range() to only handle merging and splitting of the VMAs. Align the new loop in do_mbind() and existing loop in set_mempolicy_home_node() to use the reduced mbind_range() function. This allows for a single location of the range calculation and avoids constantly looking up the previous VMA (since this is a loop over the VMAs). Link: https://lore.kernel.org/linux-mm/000000000000c93feb05f87e24ad@google.com/ Fixes: 66850be55e8e ("mm/mempolicy: use vma iterator & maple state instead of vma linked list") Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com> Reported-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20230410152205.2294819-1-Liam.Howlett@oracle.com Tested-by: syzbot+a7c1ec5b1d71ceaa5186@syzkaller.appspotmail.com Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-10 15:22:05 +00:00
err = mbind_range(&vmi, vma, &prev, start, end, new);
mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: <linux-api@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-01-14 22:08:17 +00:00
mpol_put(new);
if (err)
break;
}
mmap_write_unlock(mm);
return err;
}
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
unsigned long, mode, const unsigned long __user *, nmask,
unsigned long, maxnode, unsigned int, flags)
{
return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned short mode_flags;
nodemask_t nodes;
int lmode = mode;
int err;
err = sanitize_mpol_flags(&lmode, &mode_flags);
if (err)
return err;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
return do_set_mempolicy(lmode, mode_flags, &nodes);
}
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
unsigned long, maxnode)
{
return kernel_set_mempolicy(mode, nmask, maxnode);
}
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
const unsigned long __user *old_nodes,
const unsigned long __user *new_nodes)
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
{
struct mm_struct *mm = NULL;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
struct task_struct *task;
nodemask_t task_nodes;
int err;
nodemask_t *old;
nodemask_t *new;
NODEMASK_SCRATCH(scratch);
if (!scratch)
return -ENOMEM;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
old = &scratch->mask1;
new = &scratch->mask2;
err = get_nodes(old, old_nodes, maxnode);
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
if (err)
goto out;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
err = get_nodes(new, new_nodes, maxnode);
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
if (err)
goto out;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
/* Find the mm_struct */
rcu_read_lock();
task = pid ? find_task_by_vpid(pid) : current;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
if (!task) {
rcu_read_unlock();
err = -ESRCH;
goto out;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
}
mm: fix move/migrate_pages() race on task struct Migration functions perform the rcu_read_unlock too early. As a result the task pointed to may change from under us. This can result in an oops, as reported by Dave Hansen in https://lkml.org/lkml/2012/2/23/302. The following patch extend the period of the rcu_read_lock until after the permissions checks are done. We also take a refcount so that the task reference is stable when calling security check functions and performing cpuset node validation (which takes a mutex). The refcount is dropped before actual page migration occurs so there is no change to the refcounts held during page migration. Also move the determination of the mm of the task struct to immediately before the do_migrate*() calls so that it is clear that we switch from handling the task during permission checks to the mm for the actual migration. Since the determination is only done once and we then no longer use the task_struct we can be sure that we operate on a specific address space that will not change from under us. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Christoph Lameter <cl@linux.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Reported-by: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:06 +00:00
get_task_struct(task);
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
err = -EINVAL;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
/*
* Check if this process has the right to modify the specified process.
* Use the regular "ptrace_may_access()" checks.
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
err = -EPERM;
mm: fix move/migrate_pages() race on task struct Migration functions perform the rcu_read_unlock too early. As a result the task pointed to may change from under us. This can result in an oops, as reported by Dave Hansen in https://lkml.org/lkml/2012/2/23/302. The following patch extend the period of the rcu_read_lock until after the permissions checks are done. We also take a refcount so that the task reference is stable when calling security check functions and performing cpuset node validation (which takes a mutex). The refcount is dropped before actual page migration occurs so there is no change to the refcounts held during page migration. Also move the determination of the mm of the task struct to immediately before the do_migrate*() calls so that it is clear that we switch from handling the task during permission checks to the mm for the actual migration. Since the determination is only done once and we then no longer use the task_struct we can be sure that we operate on a specific address space that will not change from under us. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Christoph Lameter <cl@linux.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Reported-by: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:06 +00:00
goto out_put;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
}
rcu_read_unlock();
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
err = -EPERM;
mm: fix move/migrate_pages() race on task struct Migration functions perform the rcu_read_unlock too early. As a result the task pointed to may change from under us. This can result in an oops, as reported by Dave Hansen in https://lkml.org/lkml/2012/2/23/302. The following patch extend the period of the rcu_read_lock until after the permissions checks are done. We also take a refcount so that the task reference is stable when calling security check functions and performing cpuset node validation (which takes a mutex). The refcount is dropped before actual page migration occurs so there is no change to the refcounts held during page migration. Also move the determination of the mm of the task struct to immediately before the do_migrate*() calls so that it is clear that we switch from handling the task during permission checks to the mm for the actual migration. Since the determination is only done once and we then no longer use the task_struct we can be sure that we operate on a specific address space that will not change from under us. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Christoph Lameter <cl@linux.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Reported-by: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:06 +00:00
goto out_put;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
}
task_nodes = cpuset_mems_allowed(current);
nodes_and(*new, *new, task_nodes);
if (nodes_empty(*new))
goto out_put;
err = security_task_movememory(task);
if (err)
mm: fix move/migrate_pages() race on task struct Migration functions perform the rcu_read_unlock too early. As a result the task pointed to may change from under us. This can result in an oops, as reported by Dave Hansen in https://lkml.org/lkml/2012/2/23/302. The following patch extend the period of the rcu_read_lock until after the permissions checks are done. We also take a refcount so that the task reference is stable when calling security check functions and performing cpuset node validation (which takes a mutex). The refcount is dropped before actual page migration occurs so there is no change to the refcounts held during page migration. Also move the determination of the mm of the task struct to immediately before the do_migrate*() calls so that it is clear that we switch from handling the task during permission checks to the mm for the actual migration. Since the determination is only done once and we then no longer use the task_struct we can be sure that we operate on a specific address space that will not change from under us. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Christoph Lameter <cl@linux.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Reported-by: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:06 +00:00
goto out_put;
mm: fix move/migrate_pages() race on task struct Migration functions perform the rcu_read_unlock too early. As a result the task pointed to may change from under us. This can result in an oops, as reported by Dave Hansen in https://lkml.org/lkml/2012/2/23/302. The following patch extend the period of the rcu_read_lock until after the permissions checks are done. We also take a refcount so that the task reference is stable when calling security check functions and performing cpuset node validation (which takes a mutex). The refcount is dropped before actual page migration occurs so there is no change to the refcounts held during page migration. Also move the determination of the mm of the task struct to immediately before the do_migrate*() calls so that it is clear that we switch from handling the task during permission checks to the mm for the actual migration. Since the determination is only done once and we then no longer use the task_struct we can be sure that we operate on a specific address space that will not change from under us. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Christoph Lameter <cl@linux.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Reported-by: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:06 +00:00
mm = get_task_mm(task);
put_task_struct(task);
mm: fix NULL ptr dereference in migrate_pages Commit 3268c63 ("mm: fix move/migrate_pages() race on task struct") has added an odd construct where 'mm' is checked for being NULL, and if it is, it would get dereferenced anyways by mput()ing it. This would lead to the following NULL ptr deref and BUG() when calling migrate_pages() with a pid that has no mm struct: [25904.193704] BUG: unable to handle kernel NULL pointer dereference at 0000000000000050 [25904.194235] IP: [<ffffffff810b0de7>] mmput+0x27/0xf0 [25904.194235] PGD 773e6067 PUD 77da0067 PMD 0 [25904.194235] Oops: 0002 [#1] PREEMPT SMP [25904.194235] CPU 2 [25904.194235] Pid: 31608, comm: trinity Tainted: G W 3.4.0-rc2-next-20120412-sasha #69 [25904.194235] RIP: 0010:[<ffffffff810b0de7>] [<ffffffff810b0de7>] mmput+0x27/0xf0 [25904.194235] RSP: 0018:ffff880077d49e08 EFLAGS: 00010202 [25904.194235] RAX: 0000000000000286 RBX: 0000000000000000 RCX: 0000000000000000 [25904.194235] RDX: ffff880075ef8000 RSI: 000000000000023d RDI: 0000000000000286 [25904.194235] RBP: ffff880077d49e18 R08: 0000000000000001 R09: 0000000000000001 [25904.194235] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [25904.194235] R13: 00000000ffffffea R14: ffff880034287740 R15: ffff8800218d3010 [25904.194235] FS: 00007fc8b244c700(0000) GS:ffff880029800000(0000) knlGS:0000000000000000 [25904.194235] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [25904.194235] CR2: 0000000000000050 CR3: 00000000767c6000 CR4: 00000000000406e0 [25904.194235] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [25904.194235] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [25904.194235] Process trinity (pid: 31608, threadinfo ffff880077d48000, task ffff880075ef8000) [25904.194235] Stack: [25904.194235] ffff8800342876c0 0000000000000000 ffff880077d49f78 ffffffff811b8020 [25904.194235] ffffffff811b7d91 ffff880075ef8000 ffff88002256d200 0000000000000000 [25904.194235] 00000000000003ff 0000000000000000 0000000000000000 0000000000000000 [25904.194235] Call Trace: [25904.194235] [<ffffffff811b8020>] sys_migrate_pages+0x340/0x3a0 [25904.194235] [<ffffffff811b7d91>] ? sys_migrate_pages+0xb1/0x3a0 [25904.194235] [<ffffffff8266cbb9>] system_call_fastpath+0x16/0x1b [25904.194235] Code: c9 c3 66 90 55 31 d2 48 89 e5 be 3d 02 00 00 48 83 ec 10 48 89 1c 24 4c 89 64 24 08 48 89 fb 48 c7 c7 cf 0e e1 82 e8 69 18 03 00 <f0> ff 4b 50 0f 94 c0 84 c0 0f 84 aa 00 00 00 48 89 df e8 72 f1 [25904.194235] RIP [<ffffffff810b0de7>] mmput+0x27/0xf0 [25904.194235] RSP <ffff880077d49e08> [25904.194235] CR2: 0000000000000050 [25904.348999] ---[ end trace a307b3ed40206b4b ]--- Signed-off-by: Sasha Levin <levinsasha928@gmail.com> Cc: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Cc: Christoph Lameter <cl@linux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-04-25 23:01:52 +00:00
if (!mm) {
mm: fix move/migrate_pages() race on task struct Migration functions perform the rcu_read_unlock too early. As a result the task pointed to may change from under us. This can result in an oops, as reported by Dave Hansen in https://lkml.org/lkml/2012/2/23/302. The following patch extend the period of the rcu_read_lock until after the permissions checks are done. We also take a refcount so that the task reference is stable when calling security check functions and performing cpuset node validation (which takes a mutex). The refcount is dropped before actual page migration occurs so there is no change to the refcounts held during page migration. Also move the determination of the mm of the task struct to immediately before the do_migrate*() calls so that it is clear that we switch from handling the task during permission checks to the mm for the actual migration. Since the determination is only done once and we then no longer use the task_struct we can be sure that we operate on a specific address space that will not change from under us. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Christoph Lameter <cl@linux.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Reported-by: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:06 +00:00
err = -EINVAL;
mm: fix NULL ptr dereference in migrate_pages Commit 3268c63 ("mm: fix move/migrate_pages() race on task struct") has added an odd construct where 'mm' is checked for being NULL, and if it is, it would get dereferenced anyways by mput()ing it. This would lead to the following NULL ptr deref and BUG() when calling migrate_pages() with a pid that has no mm struct: [25904.193704] BUG: unable to handle kernel NULL pointer dereference at 0000000000000050 [25904.194235] IP: [<ffffffff810b0de7>] mmput+0x27/0xf0 [25904.194235] PGD 773e6067 PUD 77da0067 PMD 0 [25904.194235] Oops: 0002 [#1] PREEMPT SMP [25904.194235] CPU 2 [25904.194235] Pid: 31608, comm: trinity Tainted: G W 3.4.0-rc2-next-20120412-sasha #69 [25904.194235] RIP: 0010:[<ffffffff810b0de7>] [<ffffffff810b0de7>] mmput+0x27/0xf0 [25904.194235] RSP: 0018:ffff880077d49e08 EFLAGS: 00010202 [25904.194235] RAX: 0000000000000286 RBX: 0000000000000000 RCX: 0000000000000000 [25904.194235] RDX: ffff880075ef8000 RSI: 000000000000023d RDI: 0000000000000286 [25904.194235] RBP: ffff880077d49e18 R08: 0000000000000001 R09: 0000000000000001 [25904.194235] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [25904.194235] R13: 00000000ffffffea R14: ffff880034287740 R15: ffff8800218d3010 [25904.194235] FS: 00007fc8b244c700(0000) GS:ffff880029800000(0000) knlGS:0000000000000000 [25904.194235] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [25904.194235] CR2: 0000000000000050 CR3: 00000000767c6000 CR4: 00000000000406e0 [25904.194235] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [25904.194235] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [25904.194235] Process trinity (pid: 31608, threadinfo ffff880077d48000, task ffff880075ef8000) [25904.194235] Stack: [25904.194235] ffff8800342876c0 0000000000000000 ffff880077d49f78 ffffffff811b8020 [25904.194235] ffffffff811b7d91 ffff880075ef8000 ffff88002256d200 0000000000000000 [25904.194235] 00000000000003ff 0000000000000000 0000000000000000 0000000000000000 [25904.194235] Call Trace: [25904.194235] [<ffffffff811b8020>] sys_migrate_pages+0x340/0x3a0 [25904.194235] [<ffffffff811b7d91>] ? sys_migrate_pages+0xb1/0x3a0 [25904.194235] [<ffffffff8266cbb9>] system_call_fastpath+0x16/0x1b [25904.194235] Code: c9 c3 66 90 55 31 d2 48 89 e5 be 3d 02 00 00 48 83 ec 10 48 89 1c 24 4c 89 64 24 08 48 89 fb 48 c7 c7 cf 0e e1 82 e8 69 18 03 00 <f0> ff 4b 50 0f 94 c0 84 c0 0f 84 aa 00 00 00 48 89 df e8 72 f1 [25904.194235] RIP [<ffffffff810b0de7>] mmput+0x27/0xf0 [25904.194235] RSP <ffff880077d49e08> [25904.194235] CR2: 0000000000000050 [25904.348999] ---[ end trace a307b3ed40206b4b ]--- Signed-off-by: Sasha Levin <levinsasha928@gmail.com> Cc: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Cc: Christoph Lameter <cl@linux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-04-25 23:01:52 +00:00
goto out;
}
err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
mm: fix move/migrate_pages() race on task struct Migration functions perform the rcu_read_unlock too early. As a result the task pointed to may change from under us. This can result in an oops, as reported by Dave Hansen in https://lkml.org/lkml/2012/2/23/302. The following patch extend the period of the rcu_read_lock until after the permissions checks are done. We also take a refcount so that the task reference is stable when calling security check functions and performing cpuset node validation (which takes a mutex). The refcount is dropped before actual page migration occurs so there is no change to the refcounts held during page migration. Also move the determination of the mm of the task struct to immediately before the do_migrate*() calls so that it is clear that we switch from handling the task during permission checks to the mm for the actual migration. Since the determination is only done once and we then no longer use the task_struct we can be sure that we operate on a specific address space that will not change from under us. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Christoph Lameter <cl@linux.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Reported-by: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:06 +00:00
mmput(mm);
out:
NODEMASK_SCRATCH_FREE(scratch);
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
return err;
mm: fix move/migrate_pages() race on task struct Migration functions perform the rcu_read_unlock too early. As a result the task pointed to may change from under us. This can result in an oops, as reported by Dave Hansen in https://lkml.org/lkml/2012/2/23/302. The following patch extend the period of the rcu_read_lock until after the permissions checks are done. We also take a refcount so that the task reference is stable when calling security check functions and performing cpuset node validation (which takes a mutex). The refcount is dropped before actual page migration occurs so there is no change to the refcounts held during page migration. Also move the determination of the mm of the task struct to immediately before the do_migrate*() calls so that it is clear that we switch from handling the task during permission checks to the mm for the actual migration. Since the determination is only done once and we then no longer use the task_struct we can be sure that we operate on a specific address space that will not change from under us. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Christoph Lameter <cl@linux.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Reported-by: Dave Hansen <dave@linux.vnet.ibm.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:06 +00:00
out_put:
put_task_struct(task);
goto out;
[PATCH] Swap Migration V5: sys_migrate_pages interface sys_migrate_pages implementation using swap based page migration This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on linux-mm@kvack.org and linux-kernel@vger.kernel.org. The intent of sys_migrate is to migrate memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows to shift the memory to the new node. sys_migrate_pages is also useful if the processes available memory nodes have changed through cpuset operations to manually move the processes memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to it containing nodeset (which may be a cpuset). When direct page migration becomes available then the implementation needs to be changed to do a isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:00:51 +00:00
}
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
const unsigned long __user *, old_nodes,
const unsigned long __user *, new_nodes)
{
return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
unsigned long __user *nmask,
unsigned long maxnode,
unsigned long addr,
unsigned long flags)
{
int err;
treewide: Remove uninitialized_var() usage Using uninitialized_var() is dangerous as it papers over real bugs[1] (or can in the future), and suppresses unrelated compiler warnings (e.g. "unused variable"). If the compiler thinks it is uninitialized, either simply initialize the variable or make compiler changes. In preparation for removing[2] the[3] macro[4], remove all remaining needless uses with the following script: git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \ xargs perl -pi -e \ 's/\buninitialized_var\(([^\)]+)\)/\1/g; s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;' drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid pathological white-space. No outstanding warnings were found building allmodconfig with GCC 9.3.0 for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64, alpha, and m68k. [1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/ [2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/ [4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/ Reviewed-by: Leon Romanovsky <leonro@mellanox.com> # drivers/infiniband and mlx4/mlx5 Acked-by: Jason Gunthorpe <jgg@mellanox.com> # IB Acked-by: Kalle Valo <kvalo@codeaurora.org> # wireless drivers Reviewed-by: Chao Yu <yuchao0@huawei.com> # erofs Signed-off-by: Kees Cook <keescook@chromium.org>
2020-06-03 20:09:38 +00:00
int pval;
nodemask_t nodes;
numa: change get_mempolicy() to use nr_node_ids instead of MAX_NUMNODES The system call, get_mempolicy() [1], passes an unsigned long *nodemask pointer and an unsigned long maxnode argument which specifies the length of the user's nodemask array in bits (which is rounded up). The manual page says that if the maxnode value is too small, get_mempolicy will return EINVAL but there is no system call to return this minimum value. To determine this value, some programs search /proc/<pid>/status for a line starting with "Mems_allowed:" and use the number of digits in the mask to determine the minimum value. A recent change to the way this line is formatted [2] causes these programs to compute a value less than MAX_NUMNODES so get_mempolicy() returns EINVAL. Change get_mempolicy(), the older compat version of get_mempolicy(), and the copy_nodes_to_user() function to use nr_node_ids instead of MAX_NUMNODES, thus preserving the defacto method of computing the minimum size for the nodemask array and the maxnode argument. [1] http://man7.org/linux/man-pages/man2/get_mempolicy.2.html [2] https://lore.kernel.org/lkml/1545405631-6808-1-git-send-email-longman@redhat.com Link: http://lkml.kernel.org/r/20190211180245.22295-1-rcampbell@nvidia.com Fixes: 4fb8e5b89bcbbbb ("include/linux/nodemask.h: use nr_node_ids (not MAX_NUMNODES) in __nodemask_pr_numnodes()") Signed-off-by: Ralph Campbell <rcampbell@nvidia.com> Suggested-by: Alexander Duyck <alexander.duyck@gmail.com> Cc: Waiman Long <longman@redhat.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-02-21 06:18:58 +00:00
if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
addr = untagged_addr(addr);
err = do_get_mempolicy(&pval, &nodes, addr, flags);
if (err)
return err;
if (policy && put_user(pval, policy))
return -EFAULT;
if (nmask)
err = copy_nodes_to_user(nmask, maxnode, &nodes);
return err;
}
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
unsigned long __user *, nmask, unsigned long, maxnode,
unsigned long, addr, unsigned long, flags)
{
return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
mm/mempolicy: check hugepage migration is supported by arch in vma_migratable() vma_migratable() is called to check if pages in vma can be migrated before go ahead to further actions. Currently it is used in below code path: - task_numa_work - mbind - move_pages For hugetlb mapping, whether vma is migratable or not is determined by: - CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION - arch_hugetlb_migration_supported Issue: current code only checks for CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION alone, and no code should use it directly. (note that current code in vma_migratable don't cause failure or bug because unmap_and_move_huge_page() will catch unsupported hugepage and handle it properly) This patch checks the two factors by hugepage_migration_supported for impoving code logic and robustness. It will enable early bail out of hugepage migration procedure, but because currently all architecture supporting hugepage migration is able to support all page size, we would not see performance gain with this patch applied. vma_migratable() is moved to mm/mempolicy.c, because of the circular reference of mempolicy.h and hugetlb.h cause defining it as inline not feasible. Signed-off-by: Li Xinhai <lixinhai.lxh@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Link: http://lkml.kernel.org/r/1579786179-30633-1-git-send-email-lixinhai.lxh@gmail.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 04:10:52 +00:00
bool vma_migratable(struct vm_area_struct *vma)
{
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
return false;
/*
* DAX device mappings require predictable access latency, so avoid
* incurring periodic faults.
*/
if (vma_is_dax(vma))
return false;
if (is_vm_hugetlb_page(vma) &&
!hugepage_migration_supported(hstate_vma(vma)))
return false;
/*
* Migration allocates pages in the highest zone. If we cannot
* do so then migration (at least from node to node) is not
* possible.
*/
if (vma->vm_file &&
gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
< policy_zone)
return false;
return true;
}
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
unsigned long addr, pgoff_t *ilx)
{
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
*ilx = 0;
return (vma->vm_ops && vma->vm_ops->get_policy) ?
vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
}
/*
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
* get_vma_policy(@vma, @addr, @order, @ilx)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
* @order: 0, or appropriate huge_page_order for interleaving
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
* @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
* MPOL_WEIGHTED_INTERLEAVE
*
* Returns effective policy for a VMA at specified address.
* Falls back to current->mempolicy or system default policy, as necessary.
* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
* count--added by the get_policy() vm_op, as appropriate--to protect against
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr, int order, pgoff_t *ilx)
{
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
struct mempolicy *pol;
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
pol = __get_vma_policy(vma, addr, ilx);
if (!pol)
pol = get_task_policy(current);
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
if (pol->mode == MPOL_INTERLEAVE ||
pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
*ilx += vma->vm_pgoff >> order;
*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
}
return pol;
}
bool vma_policy_mof(struct vm_area_struct *vma)
{
struct mempolicy *pol;
if (vma->vm_ops && vma->vm_ops->get_policy) {
bool ret = false;
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
pgoff_t ilx; /* ignored here */
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
if (pol && (pol->flags & MPOL_F_MOF))
ret = true;
mpol_cond_put(pol);
return ret;
}
pol = vma->vm_policy;
if (!pol)
pol = get_task_policy(current);
return pol->flags & MPOL_F_MOF;
}
mm/hugetlb: add dedicated func to get 'allowed' nodemask for current process Muchun Song found that after MPOL_PREFERRED_MANY policy was introduced in commit b27abaccf8e8 ("mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes"), the policy_nodemask_current()'s semantics for this new policy has been changed, which returns 'preferred' nodes instead of 'allowed' nodes. With the changed semantic of policy_nodemask_current, a task with MPOL_PREFERRED_MANY policy could fail to get its reservation even though it can fall back to other nodes (either defined by cpusets or all online nodes) for that reservation failing mmap calles unnecessarily early. The fix is to not consider MPOL_PREFERRED_MANY for reservations at all because they, unlike MPOL_MBIND, do not pose any actual hard constrain. Michal suggested the policy_nodemask_current() is only used by hugetlb, and could be moved to hugetlb code with more explicit name to enforce the 'allowed' semantics for which only MPOL_BIND policy matters. apply_policy_zone() is made extern to be called in hugetlb code and its return value is changed to bool. [1]. https://lore.kernel.org/lkml/20220801084207.39086-1-songmuchun@bytedance.com/t/ Link: https://lkml.kernel.org/r/20220805005903.95563-1-feng.tang@intel.com Fixes: b27abaccf8e8 ("mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes") Signed-off-by: Feng Tang <feng.tang@intel.com> Reported-by: Muchun Song <songmuchun@bytedance.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Ben Widawsky <bwidawsk@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-08-05 00:59:03 +00:00
bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
/*
* if policy->nodes has movable memory only,
* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
*
* policy->nodes is intersect with node_states[N_MEMORY].
* so if the following test fails, it implies
* policy->nodes has movable memory only.
*/
if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
dynamic_policy_zone = ZONE_MOVABLE;
return zone >= dynamic_policy_zone;
}
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
unsigned int node;
unsigned int cpuset_mems_cookie;
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
retry:
/* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
cpuset_mems_cookie = read_mems_allowed_begin();
node = current->il_prev;
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
if (!current->il_weight || !node_isset(node, policy->nodes)) {
node = next_node_in(node, policy->nodes);
if (read_mems_allowed_retry(cpuset_mems_cookie))
goto retry;
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
if (node == MAX_NUMNODES)
return node;
current->il_prev = node;
current->il_weight = get_il_weight(node);
}
current->il_weight--;
return node;
}
/* Do dynamic interleaving for a process */
2023-10-03 09:20:14 +00:00
static unsigned int interleave_nodes(struct mempolicy *policy)
{
2023-10-03 09:20:14 +00:00
unsigned int nid;
unsigned int cpuset_mems_cookie;
/* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
do {
cpuset_mems_cookie = read_mems_allowed_begin();
nid = next_node_in(current->il_prev, policy->nodes);
} while (read_mems_allowed_retry(cpuset_mems_cookie));
2023-10-03 09:20:14 +00:00
if (nid < MAX_NUMNODES)
current->il_prev = nid;
return nid;
}
[PATCH] NUMA policies in the slab allocator V2 This patch fixes a regression in 2.6.14 against 2.6.13 that causes an imbalance in memory allocation during bootup. The slab allocator in 2.6.13 is not numa aware and simply calls alloc_pages(). This means that memory policies may control the behavior of alloc_pages(). During bootup the memory policy is set to MPOL_INTERLEAVE resulting in the spreading out of allocations during bootup over all available nodes. The slab allocator in 2.6.13 has only a single list of slab pages. As a result the per cpu slab cache and the spinlock controlled page lists may contain slab entries from off node memory. The slab allocator in 2.6.13 makes no effort to discern the locality of an entry on its lists. The NUMA aware slab allocator in 2.6.14 controls locality of the slab pages explicitly by calling alloc_pages_node(). The NUMA slab allocator manages slab entries by having lists of available slab pages for each node. The per cpu slab cache can only contain slab entries associated with the node local to the processor. This guarantees that the default allocation mode of the slab allocator always assigns local memory if available. Setting MPOL_INTERLEAVE as a default policy during bootup has no effect anymore. In 2.6.14 all node unspecific slab allocations are performed on the boot processor. This means that most of key data structures are allocated on one node. Most processors will have to refer to these structures making the boot node a potential bottleneck. This may reduce performance and cause unnecessary memory pressure on the boot node. This patch implements NUMA policies in the slab layer. There is the need of explicit application of NUMA memory policies by the slab allcator itself since the NUMA slab allocator does no longer let the page_allocator control locality. The check for policies is made directly at the beginning of __cache_alloc using current->mempolicy. The memory policy is already frequently checked by the page allocator (alloc_page_vma() and alloc_page_current()). So it is highly likely that the cacheline is present. For MPOL_INTERLEAVE kmalloc() will spread out each request to one node after another so that an equal distribution of allocations can be obtained during bootup. It is not possible to push the policy check to lower layers of the NUMA slab allocator since the per cpu caches are now only containing slab entries from the current node. If the policy says that the local node is not to be preferred or forbidden then there is no point in checking the slab cache or local list of slab pages. The allocation better be directed immediately to the lists containing slab entries for the allowed set of nodes. This way of applying policy also fixes another strange behavior in 2.6.13. alloc_pages() is controlled by the memory allocation policy of the current process. It could therefore be that one process is running with MPOL_INTERLEAVE and would f.e. obtain a new page following that policy since no slab entries are in the lists anymore. A page can typically be used for multiple slab entries but lets say that the current process is only using one. The other entries are then added to the slab lists. These are now non local entries in the slab lists despite of the possible availability of local pages that would provide faster access and increase the performance of the application. Another process without MPOL_INTERLEAVE may now run and expect a local slab entry from kmalloc(). However, there are still these free slab entries from the off node page obtained from the other process via MPOL_INTERLEAVE in the cache. The process will then get an off node slab entry although other slab entries may be available that are local to that process. This means that the policy if one process may contaminate the locality of the slab caches for other processes. This patch in effect insures that a per process policy is followed for the allocation of slab entries and that there cannot be a memory policy influence from one process to another. A process with default policy will always get a local slab entry if one is available. And the process using memory policies will get its memory arranged as requested. Off-node slab allocation will require the use of spinlocks and will make the use of per cpu caches not possible. A process using memory policies to redirect allocations offnode will have to cope with additional lock overhead in addition to the latency added by the need to access a remote slab entry. Changes V1->V2 - Remove #ifdef CONFIG_NUMA by moving forward declaration into prior #ifdef CONFIG_NUMA section. - Give the function determining the node number to use a saner name. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-19 01:42:36 +00:00
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
*/
unsigned int mempolicy_slab_node(void)
[PATCH] NUMA policies in the slab allocator V2 This patch fixes a regression in 2.6.14 against 2.6.13 that causes an imbalance in memory allocation during bootup. The slab allocator in 2.6.13 is not numa aware and simply calls alloc_pages(). This means that memory policies may control the behavior of alloc_pages(). During bootup the memory policy is set to MPOL_INTERLEAVE resulting in the spreading out of allocations during bootup over all available nodes. The slab allocator in 2.6.13 has only a single list of slab pages. As a result the per cpu slab cache and the spinlock controlled page lists may contain slab entries from off node memory. The slab allocator in 2.6.13 makes no effort to discern the locality of an entry on its lists. The NUMA aware slab allocator in 2.6.14 controls locality of the slab pages explicitly by calling alloc_pages_node(). The NUMA slab allocator manages slab entries by having lists of available slab pages for each node. The per cpu slab cache can only contain slab entries associated with the node local to the processor. This guarantees that the default allocation mode of the slab allocator always assigns local memory if available. Setting MPOL_INTERLEAVE as a default policy during bootup has no effect anymore. In 2.6.14 all node unspecific slab allocations are performed on the boot processor. This means that most of key data structures are allocated on one node. Most processors will have to refer to these structures making the boot node a potential bottleneck. This may reduce performance and cause unnecessary memory pressure on the boot node. This patch implements NUMA policies in the slab layer. There is the need of explicit application of NUMA memory policies by the slab allcator itself since the NUMA slab allocator does no longer let the page_allocator control locality. The check for policies is made directly at the beginning of __cache_alloc using current->mempolicy. The memory policy is already frequently checked by the page allocator (alloc_page_vma() and alloc_page_current()). So it is highly likely that the cacheline is present. For MPOL_INTERLEAVE kmalloc() will spread out each request to one node after another so that an equal distribution of allocations can be obtained during bootup. It is not possible to push the policy check to lower layers of the NUMA slab allocator since the per cpu caches are now only containing slab entries from the current node. If the policy says that the local node is not to be preferred or forbidden then there is no point in checking the slab cache or local list of slab pages. The allocation better be directed immediately to the lists containing slab entries for the allowed set of nodes. This way of applying policy also fixes another strange behavior in 2.6.13. alloc_pages() is controlled by the memory allocation policy of the current process. It could therefore be that one process is running with MPOL_INTERLEAVE and would f.e. obtain a new page following that policy since no slab entries are in the lists anymore. A page can typically be used for multiple slab entries but lets say that the current process is only using one. The other entries are then added to the slab lists. These are now non local entries in the slab lists despite of the possible availability of local pages that would provide faster access and increase the performance of the application. Another process without MPOL_INTERLEAVE may now run and expect a local slab entry from kmalloc(). However, there are still these free slab entries from the off node page obtained from the other process via MPOL_INTERLEAVE in the cache. The process will then get an off node slab entry although other slab entries may be available that are local to that process. This means that the policy if one process may contaminate the locality of the slab caches for other processes. This patch in effect insures that a per process policy is followed for the allocation of slab entries and that there cannot be a memory policy influence from one process to another. A process with default policy will always get a local slab entry if one is available. And the process using memory policies will get its memory arranged as requested. Off-node slab allocation will require the use of spinlocks and will make the use of per cpu caches not possible. A process using memory policies to redirect allocations offnode will have to cope with additional lock overhead in addition to the latency added by the need to access a remote slab entry. Changes V1->V2 - Remove #ifdef CONFIG_NUMA by moving forward declaration into prior #ifdef CONFIG_NUMA section. - Give the function determining the node number to use a saner name. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-19 01:42:36 +00:00
{
struct mempolicy *policy;
int node = numa_mem_id();
if (!in_task())
return node;
policy = current->mempolicy;
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
if (!policy)
return node;
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
switch (policy->mode) {
case MPOL_PREFERRED:
return first_node(policy->nodes);
[PATCH] GFP_THISNODE for the slab allocator This patch insures that the slab node lists in the NUMA case only contain slabs that belong to that specific node. All slab allocations use GFP_THISNODE when calling into the page allocator. If an allocation fails then we fall back in the slab allocator according to the zonelists appropriate for a certain context. This allows a replication of the behavior of alloc_pages and alloc_pages node in the slab layer. Currently allocations requested from the page allocator may be redirected via cpusets to other nodes. This results in remote pages on nodelists and that in turn results in interrupt latency issues during cache draining. Plus the slab is handing out memory as local when it is really remote. Fallback for slab memory allocations will occur within the slab allocator and not in the page allocator. This is necessary in order to be able to use the existing pools of objects on the nodes that we fall back to before adding more pages to a slab. The fallback function insures that the nodes we fall back to obey cpuset restrictions of the current context. We do not allocate objects from outside of the current cpuset context like before. Note that the implementation of locality constraints within the slab allocator requires importing logic from the page allocator. This is a mischmash that is not that great. Other allocators (uncached allocator, vmalloc, huge pages) face similar problems and have similar minimal reimplementations of the basic fallback logic of the page allocator. There is another way of implementing a slab by avoiding per node lists (see modular slab) but this wont work within the existing slab. V1->V2: - Use NUMA_BUILD to avoid #ifdef CONFIG_NUMA - Exploit GFP_THISNODE being 0 in the NON_NUMA case to avoid another #ifdef [akpm@osdl.org: build fix] Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 08:50:08 +00:00
[PATCH] NUMA policies in the slab allocator V2 This patch fixes a regression in 2.6.14 against 2.6.13 that causes an imbalance in memory allocation during bootup. The slab allocator in 2.6.13 is not numa aware and simply calls alloc_pages(). This means that memory policies may control the behavior of alloc_pages(). During bootup the memory policy is set to MPOL_INTERLEAVE resulting in the spreading out of allocations during bootup over all available nodes. The slab allocator in 2.6.13 has only a single list of slab pages. As a result the per cpu slab cache and the spinlock controlled page lists may contain slab entries from off node memory. The slab allocator in 2.6.13 makes no effort to discern the locality of an entry on its lists. The NUMA aware slab allocator in 2.6.14 controls locality of the slab pages explicitly by calling alloc_pages_node(). The NUMA slab allocator manages slab entries by having lists of available slab pages for each node. The per cpu slab cache can only contain slab entries associated with the node local to the processor. This guarantees that the default allocation mode of the slab allocator always assigns local memory if available. Setting MPOL_INTERLEAVE as a default policy during bootup has no effect anymore. In 2.6.14 all node unspecific slab allocations are performed on the boot processor. This means that most of key data structures are allocated on one node. Most processors will have to refer to these structures making the boot node a potential bottleneck. This may reduce performance and cause unnecessary memory pressure on the boot node. This patch implements NUMA policies in the slab layer. There is the need of explicit application of NUMA memory policies by the slab allcator itself since the NUMA slab allocator does no longer let the page_allocator control locality. The check for policies is made directly at the beginning of __cache_alloc using current->mempolicy. The memory policy is already frequently checked by the page allocator (alloc_page_vma() and alloc_page_current()). So it is highly likely that the cacheline is present. For MPOL_INTERLEAVE kmalloc() will spread out each request to one node after another so that an equal distribution of allocations can be obtained during bootup. It is not possible to push the policy check to lower layers of the NUMA slab allocator since the per cpu caches are now only containing slab entries from the current node. If the policy says that the local node is not to be preferred or forbidden then there is no point in checking the slab cache or local list of slab pages. The allocation better be directed immediately to the lists containing slab entries for the allowed set of nodes. This way of applying policy also fixes another strange behavior in 2.6.13. alloc_pages() is controlled by the memory allocation policy of the current process. It could therefore be that one process is running with MPOL_INTERLEAVE and would f.e. obtain a new page following that policy since no slab entries are in the lists anymore. A page can typically be used for multiple slab entries but lets say that the current process is only using one. The other entries are then added to the slab lists. These are now non local entries in the slab lists despite of the possible availability of local pages that would provide faster access and increase the performance of the application. Another process without MPOL_INTERLEAVE may now run and expect a local slab entry from kmalloc(). However, there are still these free slab entries from the off node page obtained from the other process via MPOL_INTERLEAVE in the cache. The process will then get an off node slab entry although other slab entries may be available that are local to that process. This means that the policy if one process may contaminate the locality of the slab caches for other processes. This patch in effect insures that a per process policy is followed for the allocation of slab entries and that there cannot be a memory policy influence from one process to another. A process with default policy will always get a local slab entry if one is available. And the process using memory policies will get its memory arranged as requested. Off-node slab allocation will require the use of spinlocks and will make the use of per cpu caches not possible. A process using memory policies to redirect allocations offnode will have to cope with additional lock overhead in addition to the latency added by the need to access a remote slab entry. Changes V1->V2 - Remove #ifdef CONFIG_NUMA by moving forward declaration into prior #ifdef CONFIG_NUMA section. - Give the function determining the node number to use a saner name. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-19 01:42:36 +00:00
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
case MPOL_WEIGHTED_INTERLEAVE:
return weighted_interleave_nodes(policy);
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
case MPOL_BIND:
case MPOL_PREFERRED_MANY:
{
struct zoneref *z;
[PATCH] NUMA policies in the slab allocator V2 This patch fixes a regression in 2.6.14 against 2.6.13 that causes an imbalance in memory allocation during bootup. The slab allocator in 2.6.13 is not numa aware and simply calls alloc_pages(). This means that memory policies may control the behavior of alloc_pages(). During bootup the memory policy is set to MPOL_INTERLEAVE resulting in the spreading out of allocations during bootup over all available nodes. The slab allocator in 2.6.13 has only a single list of slab pages. As a result the per cpu slab cache and the spinlock controlled page lists may contain slab entries from off node memory. The slab allocator in 2.6.13 makes no effort to discern the locality of an entry on its lists. The NUMA aware slab allocator in 2.6.14 controls locality of the slab pages explicitly by calling alloc_pages_node(). The NUMA slab allocator manages slab entries by having lists of available slab pages for each node. The per cpu slab cache can only contain slab entries associated with the node local to the processor. This guarantees that the default allocation mode of the slab allocator always assigns local memory if available. Setting MPOL_INTERLEAVE as a default policy during bootup has no effect anymore. In 2.6.14 all node unspecific slab allocations are performed on the boot processor. This means that most of key data structures are allocated on one node. Most processors will have to refer to these structures making the boot node a potential bottleneck. This may reduce performance and cause unnecessary memory pressure on the boot node. This patch implements NUMA policies in the slab layer. There is the need of explicit application of NUMA memory policies by the slab allcator itself since the NUMA slab allocator does no longer let the page_allocator control locality. The check for policies is made directly at the beginning of __cache_alloc using current->mempolicy. The memory policy is already frequently checked by the page allocator (alloc_page_vma() and alloc_page_current()). So it is highly likely that the cacheline is present. For MPOL_INTERLEAVE kmalloc() will spread out each request to one node after another so that an equal distribution of allocations can be obtained during bootup. It is not possible to push the policy check to lower layers of the NUMA slab allocator since the per cpu caches are now only containing slab entries from the current node. If the policy says that the local node is not to be preferred or forbidden then there is no point in checking the slab cache or local list of slab pages. The allocation better be directed immediately to the lists containing slab entries for the allowed set of nodes. This way of applying policy also fixes another strange behavior in 2.6.13. alloc_pages() is controlled by the memory allocation policy of the current process. It could therefore be that one process is running with MPOL_INTERLEAVE and would f.e. obtain a new page following that policy since no slab entries are in the lists anymore. A page can typically be used for multiple slab entries but lets say that the current process is only using one. The other entries are then added to the slab lists. These are now non local entries in the slab lists despite of the possible availability of local pages that would provide faster access and increase the performance of the application. Another process without MPOL_INTERLEAVE may now run and expect a local slab entry from kmalloc(). However, there are still these free slab entries from the off node page obtained from the other process via MPOL_INTERLEAVE in the cache. The process will then get an off node slab entry although other slab entries may be available that are local to that process. This means that the policy if one process may contaminate the locality of the slab caches for other processes. This patch in effect insures that a per process policy is followed for the allocation of slab entries and that there cannot be a memory policy influence from one process to another. A process with default policy will always get a local slab entry if one is available. And the process using memory policies will get its memory arranged as requested. Off-node slab allocation will require the use of spinlocks and will make the use of per cpu caches not possible. A process using memory policies to redirect allocations offnode will have to cope with additional lock overhead in addition to the latency added by the need to access a remote slab entry. Changes V1->V2 - Remove #ifdef CONFIG_NUMA by moving forward declaration into prior #ifdef CONFIG_NUMA section. - Give the function determining the node number to use a saner name. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-19 01:42:36 +00:00
/*
* Follow bind policy behavior and start allocation at the
* first node.
*/
struct zonelist *zonelist;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
&policy->nodes);
return z->zone ? zone_to_nid(z->zone) : node;
mm: have zonelist contains structs with both a zone pointer and zone_idx Filtering zonelists requires very frequent use of zone_idx(). This is costly as it involves a lookup of another structure and a substraction operation. As the zone_idx is often required, it should be quickly accessible. The node idx could also be stored here if it was found that accessing zone->node is significant which may be the case on workloads where nodemasks are heavily used. This patch introduces a struct zoneref to store a zone pointer and a zone index. The zonelist then consists of an array of these struct zonerefs which are looked up as necessary. Helpers are given for accessing the zone index as well as the node index. [kamezawa.hiroyu@jp.fujitsu.com: Suggested struct zoneref instead of embedding information in pointers] [hugh@veritas.com: mm-have-zonelist: fix memcg ooms] [hugh@veritas.com: just return do_try_to_free_pages] [hugh@veritas.com: do_try_to_free_pages gfp_mask redundant] Signed-off-by: Mel Gorman <mel@csn.ul.ie> Acked-by: Christoph Lameter <clameter@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Christoph Lameter <clameter@sgi.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:17 +00:00
}
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
case MPOL_LOCAL:
return node;
[PATCH] NUMA policies in the slab allocator V2 This patch fixes a regression in 2.6.14 against 2.6.13 that causes an imbalance in memory allocation during bootup. The slab allocator in 2.6.13 is not numa aware and simply calls alloc_pages(). This means that memory policies may control the behavior of alloc_pages(). During bootup the memory policy is set to MPOL_INTERLEAVE resulting in the spreading out of allocations during bootup over all available nodes. The slab allocator in 2.6.13 has only a single list of slab pages. As a result the per cpu slab cache and the spinlock controlled page lists may contain slab entries from off node memory. The slab allocator in 2.6.13 makes no effort to discern the locality of an entry on its lists. The NUMA aware slab allocator in 2.6.14 controls locality of the slab pages explicitly by calling alloc_pages_node(). The NUMA slab allocator manages slab entries by having lists of available slab pages for each node. The per cpu slab cache can only contain slab entries associated with the node local to the processor. This guarantees that the default allocation mode of the slab allocator always assigns local memory if available. Setting MPOL_INTERLEAVE as a default policy during bootup has no effect anymore. In 2.6.14 all node unspecific slab allocations are performed on the boot processor. This means that most of key data structures are allocated on one node. Most processors will have to refer to these structures making the boot node a potential bottleneck. This may reduce performance and cause unnecessary memory pressure on the boot node. This patch implements NUMA policies in the slab layer. There is the need of explicit application of NUMA memory policies by the slab allcator itself since the NUMA slab allocator does no longer let the page_allocator control locality. The check for policies is made directly at the beginning of __cache_alloc using current->mempolicy. The memory policy is already frequently checked by the page allocator (alloc_page_vma() and alloc_page_current()). So it is highly likely that the cacheline is present. For MPOL_INTERLEAVE kmalloc() will spread out each request to one node after another so that an equal distribution of allocations can be obtained during bootup. It is not possible to push the policy check to lower layers of the NUMA slab allocator since the per cpu caches are now only containing slab entries from the current node. If the policy says that the local node is not to be preferred or forbidden then there is no point in checking the slab cache or local list of slab pages. The allocation better be directed immediately to the lists containing slab entries for the allowed set of nodes. This way of applying policy also fixes another strange behavior in 2.6.13. alloc_pages() is controlled by the memory allocation policy of the current process. It could therefore be that one process is running with MPOL_INTERLEAVE and would f.e. obtain a new page following that policy since no slab entries are in the lists anymore. A page can typically be used for multiple slab entries but lets say that the current process is only using one. The other entries are then added to the slab lists. These are now non local entries in the slab lists despite of the possible availability of local pages that would provide faster access and increase the performance of the application. Another process without MPOL_INTERLEAVE may now run and expect a local slab entry from kmalloc(). However, there are still these free slab entries from the off node page obtained from the other process via MPOL_INTERLEAVE in the cache. The process will then get an off node slab entry although other slab entries may be available that are local to that process. This means that the policy if one process may contaminate the locality of the slab caches for other processes. This patch in effect insures that a per process policy is followed for the allocation of slab entries and that there cannot be a memory policy influence from one process to another. A process with default policy will always get a local slab entry if one is available. And the process using memory policies will get its memory arranged as requested. Off-node slab allocation will require the use of spinlocks and will make the use of per cpu caches not possible. A process using memory policies to redirect allocations offnode will have to cope with additional lock overhead in addition to the latency added by the need to access a remote slab entry. Changes V1->V2 - Remove #ifdef CONFIG_NUMA by moving forward declaration into prior #ifdef CONFIG_NUMA section. - Give the function determining the node number to use a saner name. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-19 01:42:36 +00:00
default:
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
BUG();
[PATCH] NUMA policies in the slab allocator V2 This patch fixes a regression in 2.6.14 against 2.6.13 that causes an imbalance in memory allocation during bootup. The slab allocator in 2.6.13 is not numa aware and simply calls alloc_pages(). This means that memory policies may control the behavior of alloc_pages(). During bootup the memory policy is set to MPOL_INTERLEAVE resulting in the spreading out of allocations during bootup over all available nodes. The slab allocator in 2.6.13 has only a single list of slab pages. As a result the per cpu slab cache and the spinlock controlled page lists may contain slab entries from off node memory. The slab allocator in 2.6.13 makes no effort to discern the locality of an entry on its lists. The NUMA aware slab allocator in 2.6.14 controls locality of the slab pages explicitly by calling alloc_pages_node(). The NUMA slab allocator manages slab entries by having lists of available slab pages for each node. The per cpu slab cache can only contain slab entries associated with the node local to the processor. This guarantees that the default allocation mode of the slab allocator always assigns local memory if available. Setting MPOL_INTERLEAVE as a default policy during bootup has no effect anymore. In 2.6.14 all node unspecific slab allocations are performed on the boot processor. This means that most of key data structures are allocated on one node. Most processors will have to refer to these structures making the boot node a potential bottleneck. This may reduce performance and cause unnecessary memory pressure on the boot node. This patch implements NUMA policies in the slab layer. There is the need of explicit application of NUMA memory policies by the slab allcator itself since the NUMA slab allocator does no longer let the page_allocator control locality. The check for policies is made directly at the beginning of __cache_alloc using current->mempolicy. The memory policy is already frequently checked by the page allocator (alloc_page_vma() and alloc_page_current()). So it is highly likely that the cacheline is present. For MPOL_INTERLEAVE kmalloc() will spread out each request to one node after another so that an equal distribution of allocations can be obtained during bootup. It is not possible to push the policy check to lower layers of the NUMA slab allocator since the per cpu caches are now only containing slab entries from the current node. If the policy says that the local node is not to be preferred or forbidden then there is no point in checking the slab cache or local list of slab pages. The allocation better be directed immediately to the lists containing slab entries for the allowed set of nodes. This way of applying policy also fixes another strange behavior in 2.6.13. alloc_pages() is controlled by the memory allocation policy of the current process. It could therefore be that one process is running with MPOL_INTERLEAVE and would f.e. obtain a new page following that policy since no slab entries are in the lists anymore. A page can typically be used for multiple slab entries but lets say that the current process is only using one. The other entries are then added to the slab lists. These are now non local entries in the slab lists despite of the possible availability of local pages that would provide faster access and increase the performance of the application. Another process without MPOL_INTERLEAVE may now run and expect a local slab entry from kmalloc(). However, there are still these free slab entries from the off node page obtained from the other process via MPOL_INTERLEAVE in the cache. The process will then get an off node slab entry although other slab entries may be available that are local to that process. This means that the policy if one process may contaminate the locality of the slab caches for other processes. This patch in effect insures that a per process policy is followed for the allocation of slab entries and that there cannot be a memory policy influence from one process to another. A process with default policy will always get a local slab entry if one is available. And the process using memory policies will get its memory arranged as requested. Off-node slab allocation will require the use of spinlocks and will make the use of per cpu caches not possible. A process using memory policies to redirect allocations offnode will have to cope with additional lock overhead in addition to the latency added by the need to access a remote slab entry. Changes V1->V2 - Remove #ifdef CONFIG_NUMA by moving forward declaration into prior #ifdef CONFIG_NUMA section. - Give the function determining the node number to use a saner name. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-19 01:42:36 +00:00
}
}
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
nodemask_t *mask)
{
/*
* barrier stabilizes the nodemask locally so that it can be iterated
* over safely without concern for changes. Allocators validate node
* selection does not violate mems_allowed, so this is safe.
*/
barrier();
memcpy(mask, &pol->nodes, sizeof(nodemask_t));
barrier();
return nodes_weight(*mask);
}
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
nodemask_t nodemask;
unsigned int target, nr_nodes;
u8 *table;
unsigned int weight_total = 0;
u8 weight;
int nid;
nr_nodes = read_once_policy_nodemask(pol, &nodemask);
if (!nr_nodes)
return numa_node_id();
rcu_read_lock();
table = rcu_dereference(iw_table);
/* calculate the total weight */
for_each_node_mask(nid, nodemask) {
/* detect system default usage */
weight = table ? table[nid] : 1;
weight = weight ? weight : 1;
weight_total += weight;
}
/* Calculate the node offset based on totals */
target = ilx % weight_total;
nid = first_node(nodemask);
while (target) {
/* detect system default usage */
weight = table ? table[nid] : 1;
weight = weight ? weight : 1;
if (target < weight)
break;
target -= weight;
nid = next_node_in(nid, nodemask);
}
rcu_read_unlock();
return nid;
}
/*
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
* Do static interleaving for interleave index @ilx. Returns the ilx'th
* node in pol->nodes (starting from ilx=0), wrapping around if ilx
* exceeds the number of present nodes.
*/
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
nodemask_t nodemask;
mm/mempolicy: fix a race between offset_il_node and mpol_rebind_task Servers happened below panic: Kernel version:5.4.56 BUG: unable to handle page fault for address: 0000000000002c48 RIP: 0010:__next_zones_zonelist+0x1d/0x40 Call Trace: __alloc_pages_nodemask+0x277/0x310 alloc_page_interleave+0x13/0x70 handle_mm_fault+0xf99/0x1390 __do_page_fault+0x288/0x500 do_page_fault+0x30/0x110 page_fault+0x3e/0x50 The reason for the panic is that MAX_NUMNODES is passed in the third parameter in __alloc_pages_nodemask(preferred_nid). So access to zonelist->zoneref->zone_idx in __next_zones_zonelist will cause a panic. In offset_il_node(), first_node() returns nid from pol->v.nodes, after this other threads may chang pol->v.nodes before next_node(). This race condition will let next_node return MAX_NUMNODES. So put pol->nodes in a local variable. The race condition is between offset_il_node and cpuset_change_task_nodemask: CPU0: CPU1: alloc_pages_vma() interleave_nid(pol,) offset_il_node(pol,) first_node(pol->v.nodes) cpuset_change_task_nodemask //nodes==0xc mpol_rebind_task mpol_rebind_policy mpol_rebind_nodemask(pol,nodes) //nodes==0x3 next_node(nid, pol->v.nodes)//return MAX_NUMNODES Link: https://lkml.kernel.org/r/20210906034658.48721-1-yanghui.def@bytedance.com Signed-off-by: yanghui <yanghui.def@bytedance.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-09 01:10:20 +00:00
unsigned int target, nnodes;
int i;
int nid;
nnodes = read_once_policy_nodemask(pol, &nodemask);
mempolicy: add MPOL_F_STATIC_NODES flag Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses the node remap when the policy is rebound. Adds another member to struct mempolicy, nodemask_t user_nodemask, as part of a union with cpuset_mems_allowed: struct mempolicy { ... union { nodemask_t cpuset_mems_allowed; nodemask_t user_nodemask; } w; } that stores the the nodemask that the user passed when he or she created the mempolicy via set_mempolicy() or mbind(). When using MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's passed nodemask intersected with the VMA or task's allowed nodes is always used when determining the preferred node, setting the MPOL_BIND zonelist, or creating the interleave nodemask. This happens whenever the policy is rebound, including when a task's cpuset assignment changes or the cpuset's mems are changed. This creates an interesting side-effect in that it allows the mempolicy "intent" to lie dormant and uneffected until it has access to the node(s) that it desires. For example, if you currently ask for an interleaved policy over a set of nodes that you do not have access to, the mempolicy is not created and the task continues to use the previous policy. With this change, however, it is possible to create the same mempolicy; it is only effected when access to nodes in the nodemask is acquired. It is also possible to mount tmpfs with the static nodemask behavior when specifying a node or nodemask. To do this, simply add "=static" immediately following the mempolicy mode at mount time: mount -o remount mpol=interleave=static:1-3 Also removes mpol_check_policy() and folds its logic into mpol_new() since it is now obsoleted. The unused vma_mpol_equal() is also removed. Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:27 +00:00
if (!nnodes)
return numa_node_id();
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
target = ilx % nnodes;
mm/mempolicy: fix a race between offset_il_node and mpol_rebind_task Servers happened below panic: Kernel version:5.4.56 BUG: unable to handle page fault for address: 0000000000002c48 RIP: 0010:__next_zones_zonelist+0x1d/0x40 Call Trace: __alloc_pages_nodemask+0x277/0x310 alloc_page_interleave+0x13/0x70 handle_mm_fault+0xf99/0x1390 __do_page_fault+0x288/0x500 do_page_fault+0x30/0x110 page_fault+0x3e/0x50 The reason for the panic is that MAX_NUMNODES is passed in the third parameter in __alloc_pages_nodemask(preferred_nid). So access to zonelist->zoneref->zone_idx in __next_zones_zonelist will cause a panic. In offset_il_node(), first_node() returns nid from pol->v.nodes, after this other threads may chang pol->v.nodes before next_node(). This race condition will let next_node return MAX_NUMNODES. So put pol->nodes in a local variable. The race condition is between offset_il_node and cpuset_change_task_nodemask: CPU0: CPU1: alloc_pages_vma() interleave_nid(pol,) offset_il_node(pol,) first_node(pol->v.nodes) cpuset_change_task_nodemask //nodes==0xc mpol_rebind_task mpol_rebind_policy mpol_rebind_nodemask(pol,nodes) //nodes==0x3 next_node(nid, pol->v.nodes)//return MAX_NUMNODES Link: https://lkml.kernel.org/r/20210906034658.48721-1-yanghui.def@bytedance.com Signed-off-by: yanghui <yanghui.def@bytedance.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-09 01:10:20 +00:00
nid = first_node(nodemask);
for (i = 0; i < target; i++)
mm/mempolicy: fix a race between offset_il_node and mpol_rebind_task Servers happened below panic: Kernel version:5.4.56 BUG: unable to handle page fault for address: 0000000000002c48 RIP: 0010:__next_zones_zonelist+0x1d/0x40 Call Trace: __alloc_pages_nodemask+0x277/0x310 alloc_page_interleave+0x13/0x70 handle_mm_fault+0xf99/0x1390 __do_page_fault+0x288/0x500 do_page_fault+0x30/0x110 page_fault+0x3e/0x50 The reason for the panic is that MAX_NUMNODES is passed in the third parameter in __alloc_pages_nodemask(preferred_nid). So access to zonelist->zoneref->zone_idx in __next_zones_zonelist will cause a panic. In offset_il_node(), first_node() returns nid from pol->v.nodes, after this other threads may chang pol->v.nodes before next_node(). This race condition will let next_node return MAX_NUMNODES. So put pol->nodes in a local variable. The race condition is between offset_il_node and cpuset_change_task_nodemask: CPU0: CPU1: alloc_pages_vma() interleave_nid(pol,) offset_il_node(pol,) first_node(pol->v.nodes) cpuset_change_task_nodemask //nodes==0xc mpol_rebind_task mpol_rebind_policy mpol_rebind_nodemask(pol,nodes) //nodes==0x3 next_node(nid, pol->v.nodes)//return MAX_NUMNODES Link: https://lkml.kernel.org/r/20210906034658.48721-1-yanghui.def@bytedance.com Signed-off-by: yanghui <yanghui.def@bytedance.com> Reviewed-by: Muchun Song <songmuchun@bytedance.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-09 01:10:20 +00:00
nid = next_node(nid, nodemask);
return nid;
}
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
/*
* Return a nodemask representing a mempolicy for filtering nodes for
* page allocation, together with preferred node id (or the input node id).
*/
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
pgoff_t ilx, int *nid)
{
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
nodemask_t *nodemask = NULL;
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
switch (pol->mode) {
case MPOL_PREFERRED:
/* Override input node id */
*nid = first_node(pol->nodes);
break;
case MPOL_PREFERRED_MANY:
nodemask = &pol->nodes;
if (pol->home_node != NUMA_NO_NODE)
*nid = pol->home_node;
break;
case MPOL_BIND:
/* Restrict to nodemask (but not on lower zones) */
if (apply_policy_zone(pol, gfp_zone(gfp)) &&
cpuset_nodemask_valid_mems_allowed(&pol->nodes))
nodemask = &pol->nodes;
if (pol->home_node != NUMA_NO_NODE)
*nid = pol->home_node;
/*
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
* __GFP_THISNODE shouldn't even be used with the bind policy
* because we might easily break the expectation to stay on the
* requested node and not break the policy.
*/
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
WARN_ON_ONCE(gfp & __GFP_THISNODE);
break;
case MPOL_INTERLEAVE:
/* Override input node id */
*nid = (ilx == NO_INTERLEAVE_INDEX) ?
interleave_nodes(pol) : interleave_nid(pol, ilx);
break;
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
case MPOL_WEIGHTED_INTERLEAVE:
*nid = (ilx == NO_INTERLEAVE_INDEX) ?
weighted_interleave_nodes(pol) :
weighted_interleave_nid(pol, ilx);
break;
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
}
return nodemask;
}
#ifdef CONFIG_HUGETLBFS
Fix NUMA Memory Policy Reference Counting This patch proposes fixes to the reference counting of memory policy in the page allocation paths and in show_numa_map(). Extracted from my "Memory Policy Cleanups and Enhancements" series as stand-alone. Shared policy lookup [shmem] has always added a reference to the policy, but this was never unrefed after page allocation or after formatting the numa map data. Default system policy should not require additional ref counting, nor should the current task's task policy. However, show_numa_map() calls get_vma_policy() to examine what may be [likely is] another task's policy. The latter case needs protection against freeing of the policy. This patch adds a reference count to a mempolicy returned by get_vma_policy() when the policy is a vma policy or another task's mempolicy. Again, shared policy is already reference counted on lookup. A matching "unref" [__mpol_free()] is performed in alloc_page_vma() for shared and vma policies, and in show_numa_map() for shared and another task's mempolicy. We can call __mpol_free() directly, saving an admittedly inexpensive inline NULL test, because we know we have a non-NULL policy. Handling policy ref counts for hugepages is a bit trickier. huge_zonelist() returns a zone list that might come from a shared or vma 'BIND policy. In this case, we should hold the reference until after the huge page allocation in dequeue_hugepage(). The patch modifies huge_zonelist() to return a pointer to the mempolicy if it needs to be unref'd after allocation. Kernel Build [16cpu, 32GB, ia64] - average of 10 runs: w/o patch w/ refcount patch Avg Std Devn Avg Std Devn Real: 100.59 0.38 100.63 0.43 User: 1209.60 0.37 1209.91 0.31 System: 81.52 0.42 81.64 0.34 Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Andi Kleen <ak@suse.de> Cc: Christoph Lameter <clameter@sgi.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-09-19 05:46:47 +00:00
/*
* huge_node(@vma, @addr, @gfp_flags, @mpol)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup and interleave policy
* @gfp_flags: for requested zone
* @mpol: pointer to mempolicy pointer for reference counted mempolicy
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
* @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
Fix NUMA Memory Policy Reference Counting This patch proposes fixes to the reference counting of memory policy in the page allocation paths and in show_numa_map(). Extracted from my "Memory Policy Cleanups and Enhancements" series as stand-alone. Shared policy lookup [shmem] has always added a reference to the policy, but this was never unrefed after page allocation or after formatting the numa map data. Default system policy should not require additional ref counting, nor should the current task's task policy. However, show_numa_map() calls get_vma_policy() to examine what may be [likely is] another task's policy. The latter case needs protection against freeing of the policy. This patch adds a reference count to a mempolicy returned by get_vma_policy() when the policy is a vma policy or another task's mempolicy. Again, shared policy is already reference counted on lookup. A matching "unref" [__mpol_free()] is performed in alloc_page_vma() for shared and vma policies, and in show_numa_map() for shared and another task's mempolicy. We can call __mpol_free() directly, saving an admittedly inexpensive inline NULL test, because we know we have a non-NULL policy. Handling policy ref counts for hugepages is a bit trickier. huge_zonelist() returns a zone list that might come from a shared or vma 'BIND policy. In this case, we should hold the reference until after the huge page allocation in dequeue_hugepage(). The patch modifies huge_zonelist() to return a pointer to the mempolicy if it needs to be unref'd after allocation. Kernel Build [16cpu, 32GB, ia64] - average of 10 runs: w/o patch w/ refcount patch Avg Std Devn Avg Std Devn Real: 100.59 0.38 100.63 0.43 User: 1209.60 0.37 1209.91 0.31 System: 81.52 0.42 81.64 0.34 Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Andi Kleen <ak@suse.de> Cc: Christoph Lameter <clameter@sgi.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-09-19 05:46:47 +00:00
*
* Returns a nid suitable for a huge page allocation and a pointer
mempolicy: rework mempolicy Reference Counting [yet again] After further discussion with Christoph Lameter, it has become clear that my earlier attempts to clean up the mempolicy reference counting were a bit of overkill in some areas, resulting in superflous ref/unref in what are usually fast paths. In other areas, further inspection reveals that I botched the unref for interleave policies. A separate patch, suitable for upstream/stable trees, fixes up the known errors in the previous attempt to fix reference counting. This patch reworks the memory policy referencing counting and, one hopes, simplifies the code. Maybe I'll get it right this time. See the update to the numa_memory_policy.txt document for a discussion of memory policy reference counting that motivates this patch. Summary: Lookup of mempolicy, based on (vma, address) need only add a reference for shared policy, and we need only unref the policy when finished for shared policies. So, this patch backs out all of the unneeded extra reference counting added by my previous attempt. It then unrefs only shared policies when we're finished with them, using the mpol_cond_put() [conditional put] helper function introduced by this patch. Note that shmem_swapin() calls read_swap_cache_async() with a dummy vma containing just the policy. read_swap_cache_async() can call alloc_page_vma() multiple times, so we can't let alloc_page_vma() unref the shared policy in this case. To avoid this, we make a copy of any non-null shared policy and remove the MPOL_F_SHARED flag from the copy. This copy occurs before reading a page [or multiple pages] from swap, so the overhead should not be an issue here. I introduced a new static inline function "mpol_cond_copy()" to copy the shared policy to an on-stack policy and remove the flags that would require a conditional free. The current implementation of mpol_cond_copy() assumes that the struct mempolicy contains no pointers to dynamically allocated structures that must be duplicated or reference counted during copy. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:16 +00:00
* to the struct mempolicy for conditional unref after allocation.
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
* If the effective policy is 'bind' or 'prefer-many', returns a pointer
* to the mempolicy's @nodemask for filtering the zonelist.
Fix NUMA Memory Policy Reference Counting This patch proposes fixes to the reference counting of memory policy in the page allocation paths and in show_numa_map(). Extracted from my "Memory Policy Cleanups and Enhancements" series as stand-alone. Shared policy lookup [shmem] has always added a reference to the policy, but this was never unrefed after page allocation or after formatting the numa map data. Default system policy should not require additional ref counting, nor should the current task's task policy. However, show_numa_map() calls get_vma_policy() to examine what may be [likely is] another task's policy. The latter case needs protection against freeing of the policy. This patch adds a reference count to a mempolicy returned by get_vma_policy() when the policy is a vma policy or another task's mempolicy. Again, shared policy is already reference counted on lookup. A matching "unref" [__mpol_free()] is performed in alloc_page_vma() for shared and vma policies, and in show_numa_map() for shared and another task's mempolicy. We can call __mpol_free() directly, saving an admittedly inexpensive inline NULL test, because we know we have a non-NULL policy. Handling policy ref counts for hugepages is a bit trickier. huge_zonelist() returns a zone list that might come from a shared or vma 'BIND policy. In this case, we should hold the reference until after the huge page allocation in dequeue_hugepage(). The patch modifies huge_zonelist() to return a pointer to the mempolicy if it needs to be unref'd after allocation. Kernel Build [16cpu, 32GB, ia64] - average of 10 runs: w/o patch w/ refcount patch Avg Std Devn Avg Std Devn Real: 100.59 0.38 100.63 0.43 User: 1209.60 0.37 1209.91 0.31 System: 81.52 0.42 81.64 0.34 Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Andi Kleen <ak@suse.de> Cc: Christoph Lameter <clameter@sgi.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-09-19 05:46:47 +00:00
*/
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
struct mempolicy **mpol, nodemask_t **nodemask)
{
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
pgoff_t ilx;
int nid;
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
nid = numa_node_id();
*mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
*nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
return nid;
}
hugetlb: derive huge pages nodes allowed from task mempolicy This patch derives a "nodes_allowed" node mask from the numa mempolicy of the task modifying the number of persistent huge pages to control the allocation, freeing and adjusting of surplus huge pages when the pool page count is modified via the new sysctl or sysfs attribute "nr_hugepages_mempolicy". The nodes_allowed mask is derived as follows: * For "default" [NULL] task mempolicy, a NULL nodemask_t pointer is produced. This will cause the hugetlb subsystem to use node_online_map as the "nodes_allowed". This preserves the behavior before this patch. * For "preferred" mempolicy, including explicit local allocation, a nodemask with the single preferred node will be produced. "local" policy will NOT track any internode migrations of the task adjusting nr_hugepages. * For "bind" and "interleave" policy, the mempolicy's nodemask will be used. * Other than to inform the construction of the nodes_allowed node mask, the actual mempolicy mode is ignored. That is, all modes behave like interleave over the resulting nodes_allowed mask with no "fallback". See the updated documentation [next patch] for more information about the implications of this patch. Examples: Starting with: Node 0 HugePages_Total: 0 Node 1 HugePages_Total: 0 Node 2 HugePages_Total: 0 Node 3 HugePages_Total: 0 Default behavior [with or without this patch] balances persistent hugepage allocation across nodes [with sufficient contiguous memory]: sysctl vm.nr_hugepages[_mempolicy]=32 yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 8 Node 3 HugePages_Total: 8 Of course, we only have nr_hugepages_mempolicy with the patch, but with default mempolicy, nr_hugepages_mempolicy behaves the same as nr_hugepages. Applying mempolicy--e.g., with numactl [using '-m' a.k.a. '--membind' because it allows multiple nodes to be specified and it's easy to type]--we can allocate huge pages on individual nodes or sets of nodes. So, starting from the condition above, with 8 huge pages per node, add 8 more to node 2 using: numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40 This yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The incremental 8 huge pages were restricted to node 2 by the specified mempolicy. Similarly, we can use mempolicy to free persistent huge pages from specified nodes: numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32 yields: Node 0 HugePages_Total: 4 Node 1 HugePages_Total: 4 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The 8 huge pages freed were balanced over nodes 0 and 1. [rientjes@google.com: accomodate reworked NODEMASK_ALLOC] Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Reviewed-by: Andi Kleen <andi@firstfloor.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Randy Dunlap <randy.dunlap@oracle.com> Cc: Nishanth Aravamudan <nacc@us.ibm.com> Cc: Adam Litke <agl@us.ibm.com> Cc: Andy Whitcroft <apw@canonical.com> Cc: Eric Whitney <eric.whitney@hp.com> Cc: Christoph Lameter <cl@linux-foundation.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 01:58:21 +00:00
/*
* init_nodemask_of_mempolicy
*
* If the current task's mempolicy is "default" [NULL], return 'false'
* to indicate default policy. Otherwise, extract the policy nodemask
* for 'bind' or 'interleave' policy into the argument nodemask, or
* initialize the argument nodemask to contain the single node for
* 'preferred' or 'local' policy and return 'true' to indicate presence
* of non-default mempolicy.
*
* We don't bother with reference counting the mempolicy [mpol_get/put]
* because the current task is examining it's own mempolicy and a task's
* mempolicy is only ever changed by the task itself.
*
* N.B., it is the caller's responsibility to free a returned nodemask.
*/
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
struct mempolicy *mempolicy;
if (!(mask && current->mempolicy))
return false;
cpuset,mm: fix no node to alloc memory when changing cpuset's mems Before applying this patch, cpuset updates task->mems_allowed and mempolicy by setting all new bits in the nodemask first, and clearing all old unallowed bits later. But in the way, the allocator may find that there is no node to alloc memory. The reason is that cpuset rebinds the task's mempolicy, it cleans the nodes which the allocater can alloc pages on, for example: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom This patch fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. [akpm@linux-foundation.org: fix spello] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Paul Menage <menage@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Ravikiran Thirumalai <kiran@scalex86.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andi Kleen <andi@firstfloor.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-24 21:32:08 +00:00
task_lock(current);
hugetlb: derive huge pages nodes allowed from task mempolicy This patch derives a "nodes_allowed" node mask from the numa mempolicy of the task modifying the number of persistent huge pages to control the allocation, freeing and adjusting of surplus huge pages when the pool page count is modified via the new sysctl or sysfs attribute "nr_hugepages_mempolicy". The nodes_allowed mask is derived as follows: * For "default" [NULL] task mempolicy, a NULL nodemask_t pointer is produced. This will cause the hugetlb subsystem to use node_online_map as the "nodes_allowed". This preserves the behavior before this patch. * For "preferred" mempolicy, including explicit local allocation, a nodemask with the single preferred node will be produced. "local" policy will NOT track any internode migrations of the task adjusting nr_hugepages. * For "bind" and "interleave" policy, the mempolicy's nodemask will be used. * Other than to inform the construction of the nodes_allowed node mask, the actual mempolicy mode is ignored. That is, all modes behave like interleave over the resulting nodes_allowed mask with no "fallback". See the updated documentation [next patch] for more information about the implications of this patch. Examples: Starting with: Node 0 HugePages_Total: 0 Node 1 HugePages_Total: 0 Node 2 HugePages_Total: 0 Node 3 HugePages_Total: 0 Default behavior [with or without this patch] balances persistent hugepage allocation across nodes [with sufficient contiguous memory]: sysctl vm.nr_hugepages[_mempolicy]=32 yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 8 Node 3 HugePages_Total: 8 Of course, we only have nr_hugepages_mempolicy with the patch, but with default mempolicy, nr_hugepages_mempolicy behaves the same as nr_hugepages. Applying mempolicy--e.g., with numactl [using '-m' a.k.a. '--membind' because it allows multiple nodes to be specified and it's easy to type]--we can allocate huge pages on individual nodes or sets of nodes. So, starting from the condition above, with 8 huge pages per node, add 8 more to node 2 using: numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40 This yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The incremental 8 huge pages were restricted to node 2 by the specified mempolicy. Similarly, we can use mempolicy to free persistent huge pages from specified nodes: numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32 yields: Node 0 HugePages_Total: 4 Node 1 HugePages_Total: 4 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The 8 huge pages freed were balanced over nodes 0 and 1. [rientjes@google.com: accomodate reworked NODEMASK_ALLOC] Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Reviewed-by: Andi Kleen <andi@firstfloor.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Randy Dunlap <randy.dunlap@oracle.com> Cc: Nishanth Aravamudan <nacc@us.ibm.com> Cc: Adam Litke <agl@us.ibm.com> Cc: Andy Whitcroft <apw@canonical.com> Cc: Eric Whitney <eric.whitney@hp.com> Cc: Christoph Lameter <cl@linux-foundation.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 01:58:21 +00:00
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
case MPOL_PREFERRED_MANY:
hugetlb: derive huge pages nodes allowed from task mempolicy This patch derives a "nodes_allowed" node mask from the numa mempolicy of the task modifying the number of persistent huge pages to control the allocation, freeing and adjusting of surplus huge pages when the pool page count is modified via the new sysctl or sysfs attribute "nr_hugepages_mempolicy". The nodes_allowed mask is derived as follows: * For "default" [NULL] task mempolicy, a NULL nodemask_t pointer is produced. This will cause the hugetlb subsystem to use node_online_map as the "nodes_allowed". This preserves the behavior before this patch. * For "preferred" mempolicy, including explicit local allocation, a nodemask with the single preferred node will be produced. "local" policy will NOT track any internode migrations of the task adjusting nr_hugepages. * For "bind" and "interleave" policy, the mempolicy's nodemask will be used. * Other than to inform the construction of the nodes_allowed node mask, the actual mempolicy mode is ignored. That is, all modes behave like interleave over the resulting nodes_allowed mask with no "fallback". See the updated documentation [next patch] for more information about the implications of this patch. Examples: Starting with: Node 0 HugePages_Total: 0 Node 1 HugePages_Total: 0 Node 2 HugePages_Total: 0 Node 3 HugePages_Total: 0 Default behavior [with or without this patch] balances persistent hugepage allocation across nodes [with sufficient contiguous memory]: sysctl vm.nr_hugepages[_mempolicy]=32 yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 8 Node 3 HugePages_Total: 8 Of course, we only have nr_hugepages_mempolicy with the patch, but with default mempolicy, nr_hugepages_mempolicy behaves the same as nr_hugepages. Applying mempolicy--e.g., with numactl [using '-m' a.k.a. '--membind' because it allows multiple nodes to be specified and it's easy to type]--we can allocate huge pages on individual nodes or sets of nodes. So, starting from the condition above, with 8 huge pages per node, add 8 more to node 2 using: numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40 This yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The incremental 8 huge pages were restricted to node 2 by the specified mempolicy. Similarly, we can use mempolicy to free persistent huge pages from specified nodes: numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32 yields: Node 0 HugePages_Total: 4 Node 1 HugePages_Total: 4 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The 8 huge pages freed were balanced over nodes 0 and 1. [rientjes@google.com: accomodate reworked NODEMASK_ALLOC] Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Reviewed-by: Andi Kleen <andi@firstfloor.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Randy Dunlap <randy.dunlap@oracle.com> Cc: Nishanth Aravamudan <nacc@us.ibm.com> Cc: Adam Litke <agl@us.ibm.com> Cc: Andy Whitcroft <apw@canonical.com> Cc: Eric Whitney <eric.whitney@hp.com> Cc: Christoph Lameter <cl@linux-foundation.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 01:58:21 +00:00
case MPOL_BIND:
case MPOL_INTERLEAVE:
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
case MPOL_WEIGHTED_INTERLEAVE:
*mask = mempolicy->nodes;
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
break;
case MPOL_LOCAL:
init_nodemask_of_node(mask, numa_node_id());
hugetlb: derive huge pages nodes allowed from task mempolicy This patch derives a "nodes_allowed" node mask from the numa mempolicy of the task modifying the number of persistent huge pages to control the allocation, freeing and adjusting of surplus huge pages when the pool page count is modified via the new sysctl or sysfs attribute "nr_hugepages_mempolicy". The nodes_allowed mask is derived as follows: * For "default" [NULL] task mempolicy, a NULL nodemask_t pointer is produced. This will cause the hugetlb subsystem to use node_online_map as the "nodes_allowed". This preserves the behavior before this patch. * For "preferred" mempolicy, including explicit local allocation, a nodemask with the single preferred node will be produced. "local" policy will NOT track any internode migrations of the task adjusting nr_hugepages. * For "bind" and "interleave" policy, the mempolicy's nodemask will be used. * Other than to inform the construction of the nodes_allowed node mask, the actual mempolicy mode is ignored. That is, all modes behave like interleave over the resulting nodes_allowed mask with no "fallback". See the updated documentation [next patch] for more information about the implications of this patch. Examples: Starting with: Node 0 HugePages_Total: 0 Node 1 HugePages_Total: 0 Node 2 HugePages_Total: 0 Node 3 HugePages_Total: 0 Default behavior [with or without this patch] balances persistent hugepage allocation across nodes [with sufficient contiguous memory]: sysctl vm.nr_hugepages[_mempolicy]=32 yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 8 Node 3 HugePages_Total: 8 Of course, we only have nr_hugepages_mempolicy with the patch, but with default mempolicy, nr_hugepages_mempolicy behaves the same as nr_hugepages. Applying mempolicy--e.g., with numactl [using '-m' a.k.a. '--membind' because it allows multiple nodes to be specified and it's easy to type]--we can allocate huge pages on individual nodes or sets of nodes. So, starting from the condition above, with 8 huge pages per node, add 8 more to node 2 using: numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40 This yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The incremental 8 huge pages were restricted to node 2 by the specified mempolicy. Similarly, we can use mempolicy to free persistent huge pages from specified nodes: numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32 yields: Node 0 HugePages_Total: 4 Node 1 HugePages_Total: 4 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The 8 huge pages freed were balanced over nodes 0 and 1. [rientjes@google.com: accomodate reworked NODEMASK_ALLOC] Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Reviewed-by: Andi Kleen <andi@firstfloor.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Randy Dunlap <randy.dunlap@oracle.com> Cc: Nishanth Aravamudan <nacc@us.ibm.com> Cc: Adam Litke <agl@us.ibm.com> Cc: Andy Whitcroft <apw@canonical.com> Cc: Eric Whitney <eric.whitney@hp.com> Cc: Christoph Lameter <cl@linux-foundation.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 01:58:21 +00:00
break;
default:
BUG();
}
cpuset,mm: fix no node to alloc memory when changing cpuset's mems Before applying this patch, cpuset updates task->mems_allowed and mempolicy by setting all new bits in the nodemask first, and clearing all old unallowed bits later. But in the way, the allocator may find that there is no node to alloc memory. The reason is that cpuset rebinds the task's mempolicy, it cleans the nodes which the allocater can alloc pages on, for example: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom This patch fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. [akpm@linux-foundation.org: fix spello] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Paul Menage <menage@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Ravikiran Thirumalai <kiran@scalex86.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andi Kleen <andi@firstfloor.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-24 21:32:08 +00:00
task_unlock(current);
hugetlb: derive huge pages nodes allowed from task mempolicy This patch derives a "nodes_allowed" node mask from the numa mempolicy of the task modifying the number of persistent huge pages to control the allocation, freeing and adjusting of surplus huge pages when the pool page count is modified via the new sysctl or sysfs attribute "nr_hugepages_mempolicy". The nodes_allowed mask is derived as follows: * For "default" [NULL] task mempolicy, a NULL nodemask_t pointer is produced. This will cause the hugetlb subsystem to use node_online_map as the "nodes_allowed". This preserves the behavior before this patch. * For "preferred" mempolicy, including explicit local allocation, a nodemask with the single preferred node will be produced. "local" policy will NOT track any internode migrations of the task adjusting nr_hugepages. * For "bind" and "interleave" policy, the mempolicy's nodemask will be used. * Other than to inform the construction of the nodes_allowed node mask, the actual mempolicy mode is ignored. That is, all modes behave like interleave over the resulting nodes_allowed mask with no "fallback". See the updated documentation [next patch] for more information about the implications of this patch. Examples: Starting with: Node 0 HugePages_Total: 0 Node 1 HugePages_Total: 0 Node 2 HugePages_Total: 0 Node 3 HugePages_Total: 0 Default behavior [with or without this patch] balances persistent hugepage allocation across nodes [with sufficient contiguous memory]: sysctl vm.nr_hugepages[_mempolicy]=32 yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 8 Node 3 HugePages_Total: 8 Of course, we only have nr_hugepages_mempolicy with the patch, but with default mempolicy, nr_hugepages_mempolicy behaves the same as nr_hugepages. Applying mempolicy--e.g., with numactl [using '-m' a.k.a. '--membind' because it allows multiple nodes to be specified and it's easy to type]--we can allocate huge pages on individual nodes or sets of nodes. So, starting from the condition above, with 8 huge pages per node, add 8 more to node 2 using: numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40 This yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The incremental 8 huge pages were restricted to node 2 by the specified mempolicy. Similarly, we can use mempolicy to free persistent huge pages from specified nodes: numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32 yields: Node 0 HugePages_Total: 4 Node 1 HugePages_Total: 4 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The 8 huge pages freed were balanced over nodes 0 and 1. [rientjes@google.com: accomodate reworked NODEMASK_ALLOC] Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Reviewed-by: Andi Kleen <andi@firstfloor.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Randy Dunlap <randy.dunlap@oracle.com> Cc: Nishanth Aravamudan <nacc@us.ibm.com> Cc: Adam Litke <agl@us.ibm.com> Cc: Andy Whitcroft <apw@canonical.com> Cc: Eric Whitney <eric.whitney@hp.com> Cc: Christoph Lameter <cl@linux-foundation.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 01:58:21 +00:00
return true;
}
#endif
/*
mm/mempolicy: cleanup nodemask intersection check for oom Patch series "mm/mempolicy: some fix and semantics cleanup", v4. Current memory policy code has some confusing and ambiguous part about MPOL_LOCAL policy, as it is handled as a faked MPOL_PREFERRED one, and there are many places having to distinguish them. Also the nodemask intersection check needs cleanup to be more explicit for OOM use, and handle MPOL_INTERLEAVE correctly. This patchset cleans up these and unifies the parameter sanity check for mbind() and set_mempolicy(). This patch (of 3): mempolicy_nodemask_intersects seem to be a general purpose mempolicy function. In fact it is partially tailored for the OOM purpose instead. The oom proper is the only existing user so rename the function to make that purpose explicit. While at it drop the MPOL_INTERLEAVE as those allocations never has a nodemask defined (see alloc_page_interleave) so this is a dead code and a confusing one because MPOL_INTERLEAVE is a hint rather than a hard requirement so it shouldn't be considered during the OOM. The final code can be reduced to a check for MPOL_BIND which is the only memory policy that is a hard requirement and thus relevant to a constrained OOM logic. [mhocko@suse.com: changelog edits] Link: https://lkml.kernel.org/r/1622560492-1294-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622560492-1294-2-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-2-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:50:56 +00:00
* mempolicy_in_oom_domain
*
mm/mempolicy: cleanup nodemask intersection check for oom Patch series "mm/mempolicy: some fix and semantics cleanup", v4. Current memory policy code has some confusing and ambiguous part about MPOL_LOCAL policy, as it is handled as a faked MPOL_PREFERRED one, and there are many places having to distinguish them. Also the nodemask intersection check needs cleanup to be more explicit for OOM use, and handle MPOL_INTERLEAVE correctly. This patchset cleans up these and unifies the parameter sanity check for mbind() and set_mempolicy(). This patch (of 3): mempolicy_nodemask_intersects seem to be a general purpose mempolicy function. In fact it is partially tailored for the OOM purpose instead. The oom proper is the only existing user so rename the function to make that purpose explicit. While at it drop the MPOL_INTERLEAVE as those allocations never has a nodemask defined (see alloc_page_interleave) so this is a dead code and a confusing one because MPOL_INTERLEAVE is a hint rather than a hard requirement so it shouldn't be considered during the OOM. The final code can be reduced to a check for MPOL_BIND which is the only memory policy that is a hard requirement and thus relevant to a constrained OOM logic. [mhocko@suse.com: changelog edits] Link: https://lkml.kernel.org/r/1622560492-1294-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622560492-1294-2-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-2-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:50:56 +00:00
* If tsk's mempolicy is "bind", check for intersection between mask and
* the policy nodemask. Otherwise, return true for all other policies
* including "interleave", as a tsk with "interleave" policy may have
* memory allocated from all nodes in system.
*
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
*/
mm/mempolicy: cleanup nodemask intersection check for oom Patch series "mm/mempolicy: some fix and semantics cleanup", v4. Current memory policy code has some confusing and ambiguous part about MPOL_LOCAL policy, as it is handled as a faked MPOL_PREFERRED one, and there are many places having to distinguish them. Also the nodemask intersection check needs cleanup to be more explicit for OOM use, and handle MPOL_INTERLEAVE correctly. This patchset cleans up these and unifies the parameter sanity check for mbind() and set_mempolicy(). This patch (of 3): mempolicy_nodemask_intersects seem to be a general purpose mempolicy function. In fact it is partially tailored for the OOM purpose instead. The oom proper is the only existing user so rename the function to make that purpose explicit. While at it drop the MPOL_INTERLEAVE as those allocations never has a nodemask defined (see alloc_page_interleave) so this is a dead code and a confusing one because MPOL_INTERLEAVE is a hint rather than a hard requirement so it shouldn't be considered during the OOM. The final code can be reduced to a check for MPOL_BIND which is the only memory policy that is a hard requirement and thus relevant to a constrained OOM logic. [mhocko@suse.com: changelog edits] Link: https://lkml.kernel.org/r/1622560492-1294-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622560492-1294-2-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-2-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:50:56 +00:00
bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask)
{
struct mempolicy *mempolicy;
bool ret = true;
if (!mask)
return ret;
mm/mempolicy: cleanup nodemask intersection check for oom Patch series "mm/mempolicy: some fix and semantics cleanup", v4. Current memory policy code has some confusing and ambiguous part about MPOL_LOCAL policy, as it is handled as a faked MPOL_PREFERRED one, and there are many places having to distinguish them. Also the nodemask intersection check needs cleanup to be more explicit for OOM use, and handle MPOL_INTERLEAVE correctly. This patchset cleans up these and unifies the parameter sanity check for mbind() and set_mempolicy(). This patch (of 3): mempolicy_nodemask_intersects seem to be a general purpose mempolicy function. In fact it is partially tailored for the OOM purpose instead. The oom proper is the only existing user so rename the function to make that purpose explicit. While at it drop the MPOL_INTERLEAVE as those allocations never has a nodemask defined (see alloc_page_interleave) so this is a dead code and a confusing one because MPOL_INTERLEAVE is a hint rather than a hard requirement so it shouldn't be considered during the OOM. The final code can be reduced to a check for MPOL_BIND which is the only memory policy that is a hard requirement and thus relevant to a constrained OOM logic. [mhocko@suse.com: changelog edits] Link: https://lkml.kernel.org/r/1622560492-1294-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622560492-1294-2-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-2-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:50:56 +00:00
task_lock(tsk);
mempolicy = tsk->mempolicy;
mm/mempolicy: cleanup nodemask intersection check for oom Patch series "mm/mempolicy: some fix and semantics cleanup", v4. Current memory policy code has some confusing and ambiguous part about MPOL_LOCAL policy, as it is handled as a faked MPOL_PREFERRED one, and there are many places having to distinguish them. Also the nodemask intersection check needs cleanup to be more explicit for OOM use, and handle MPOL_INTERLEAVE correctly. This patchset cleans up these and unifies the parameter sanity check for mbind() and set_mempolicy(). This patch (of 3): mempolicy_nodemask_intersects seem to be a general purpose mempolicy function. In fact it is partially tailored for the OOM purpose instead. The oom proper is the only existing user so rename the function to make that purpose explicit. While at it drop the MPOL_INTERLEAVE as those allocations never has a nodemask defined (see alloc_page_interleave) so this is a dead code and a confusing one because MPOL_INTERLEAVE is a hint rather than a hard requirement so it shouldn't be considered during the OOM. The final code can be reduced to a check for MPOL_BIND which is the only memory policy that is a hard requirement and thus relevant to a constrained OOM logic. [mhocko@suse.com: changelog edits] Link: https://lkml.kernel.org/r/1622560492-1294-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622560492-1294-2-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-2-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:50:56 +00:00
if (mempolicy && mempolicy->mode == MPOL_BIND)
ret = nodes_intersects(mempolicy->nodes, *mask);
task_unlock(tsk);
mm/mempolicy: cleanup nodemask intersection check for oom Patch series "mm/mempolicy: some fix and semantics cleanup", v4. Current memory policy code has some confusing and ambiguous part about MPOL_LOCAL policy, as it is handled as a faked MPOL_PREFERRED one, and there are many places having to distinguish them. Also the nodemask intersection check needs cleanup to be more explicit for OOM use, and handle MPOL_INTERLEAVE correctly. This patchset cleans up these and unifies the parameter sanity check for mbind() and set_mempolicy(). This patch (of 3): mempolicy_nodemask_intersects seem to be a general purpose mempolicy function. In fact it is partially tailored for the OOM purpose instead. The oom proper is the only existing user so rename the function to make that purpose explicit. While at it drop the MPOL_INTERLEAVE as those allocations never has a nodemask defined (see alloc_page_interleave) so this is a dead code and a confusing one because MPOL_INTERLEAVE is a hint rather than a hard requirement so it shouldn't be considered during the OOM. The final code can be reduced to a check for MPOL_BIND which is the only memory policy that is a hard requirement and thus relevant to a constrained OOM logic. [mhocko@suse.com: changelog edits] Link: https://lkml.kernel.org/r/1622560492-1294-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622560492-1294-2-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-1-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/1622469956-82897-2-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:50:56 +00:00
return ret;
}
2021-09-02 22:00:10 +00:00
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
int nid, nodemask_t *nodemask)
2021-09-02 22:00:10 +00:00
{
struct page *page;
gfp_t preferred_gfp;
/*
* This is a two pass approach. The first pass will only try the
* preferred nodes but skip the direct reclaim and allow the
* allocation to fail, while the second pass will try all the
* nodes in system.
*/
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
page = __alloc_pages(preferred_gfp, order, nid, nodemask);
2021-09-02 22:00:10 +00:00
if (!page)
page = __alloc_pages(gfp, order, nid, NULL);
2021-09-02 22:00:10 +00:00
return page;
}
/**
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
* alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
* @gfp: GFP flags.
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
* @order: Order of the page allocation.
* @pol: Pointer to the NUMA mempolicy.
* @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
* @nid: Preferred node (usually numa_node_id() but @mpol may override it).
*
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
* Return: The page on success or NULL if allocation fails.
*/
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
struct mempolicy *pol, pgoff_t ilx, int nid)
{
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
nodemask_t *nodemask;
struct page *page;
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
nodemask = policy_nodemask(gfp, pol, ilx, &nid);
2021-09-02 22:00:10 +00:00
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
if (pol->mode == MPOL_PREFERRED_MANY)
return alloc_pages_preferred_many(gfp, order, nid, nodemask);
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
/* filter "hugepage" allocation, unless from alloc_pages() */
order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
/*
* For hugepage allocation and non-interleave policy which
* allows the current node (or other explicitly preferred
* node) we only try to allocate from the current/preferred
* node and don't fall back to other nodes, as the cost of
* remote accesses would likely offset THP benefits.
*
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
* If the policy is interleave or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
if (pol->mode != MPOL_INTERLEAVE &&
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
(!nodemask || node_isset(nid, *nodemask))) {
mm, thp: tweak reclaim/compaction effort of local-only and all-node allocations THP page faults now attempt a __GFP_THISNODE allocation first, which should only compact existing free memory, followed by another attempt that can allocate from any node using reclaim/compaction effort specified by global defrag setting and madvise. This patch makes the following changes to the scheme: - Before the patch, the first allocation relies on a check for pageblock order and __GFP_IO to prevent excessive reclaim. This however affects also the second attempt, which is not limited to single node. Instead of that, reuse the existing check for costly order __GFP_NORETRY allocations, and make sure the first THP attempt uses __GFP_NORETRY. As a side-effect, all costly order __GFP_NORETRY allocations will bail out if compaction needs reclaim, while previously they only bailed out when compaction was deferred due to previous failures. This should be still acceptable within the __GFP_NORETRY semantics. - Before the patch, the second allocation attempt (on all nodes) was passing __GFP_NORETRY. This is redundant as the check for pageblock order (discussed above) was stronger. It's also contrary to madvise(MADV_HUGEPAGE) which means some effort to allocate THP is requested. After this patch, the second attempt doesn't pass __GFP_THISNODE nor __GFP_NORETRY. To sum up, THP page faults now try the following attempts: 1. local node only THP allocation with no reclaim, just compaction. 2. for madvised VMA's or when synchronous compaction is enabled always - THP allocation from any node with effort determined by global defrag setting and VMA madvise 3. fallback to base pages on any node Link: http://lkml.kernel.org/r/08a3f4dd-c3ce-0009-86c5-9ee51aba8557@suse.cz Fixes: b39d0ee2632d ("mm, page_alloc: avoid expensive reclaim when compaction may not succeed") Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: "Kirill A. Shutemov" <kirill@shutemov.name> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-01-14 00:29:04 +00:00
/*
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
*/
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
page = __alloc_pages_node(nid,
gfp | __GFP_THISNODE | __GFP_NORETRY, order);
if (page || !(gfp & __GFP_DIRECT_RECLAIM))
return page;
mm, page_alloc: allow hugepage fallback to remote nodes when madvised For systems configured to always try hard to allocate transparent hugepages (thp defrag setting of "always") or for memory that has been explicitly madvised to MADV_HUGEPAGE, it is often better to fallback to remote memory to allocate the hugepage if the local allocation fails first. The point is to allow the initial call to __alloc_pages_node() to attempt to defragment local memory to make a hugepage available, if possible, rather than immediately fallback to remote memory. Local hugepages will always have a better access latency than remote (huge)pages, so an attempt to make a hugepage available locally is always preferred. If memory compaction cannot be successful locally, however, it is likely better to fallback to remote memory. This could take on two forms: either allow immediate fallback to remote memory or do per-zone watermark checks. It would be possible to fallback only when per-zone watermarks fail for order-0 memory, since that would require local reclaim for all subsequent faults so remote huge allocation is likely better than thrashing the local zone for large workloads. In this case, it is assumed that because the system is configured to try hard to allocate hugepages or the vma is advised to explicitly want to try hard for hugepages that remote allocation is better when local allocation and memory compaction have both failed. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Stefan Priebe - Profihost AG <s.priebe@profihost.ag> Cc: "Kirill A. Shutemov" <kirill@shutemov.name> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-04 19:54:25 +00:00
/*
* If hugepage allocations are configured to always
* synchronous compact or the vma has been madvised
* to prefer hugepage backing, retry allowing remote
mm, thp: tweak reclaim/compaction effort of local-only and all-node allocations THP page faults now attempt a __GFP_THISNODE allocation first, which should only compact existing free memory, followed by another attempt that can allocate from any node using reclaim/compaction effort specified by global defrag setting and madvise. This patch makes the following changes to the scheme: - Before the patch, the first allocation relies on a check for pageblock order and __GFP_IO to prevent excessive reclaim. This however affects also the second attempt, which is not limited to single node. Instead of that, reuse the existing check for costly order __GFP_NORETRY allocations, and make sure the first THP attempt uses __GFP_NORETRY. As a side-effect, all costly order __GFP_NORETRY allocations will bail out if compaction needs reclaim, while previously they only bailed out when compaction was deferred due to previous failures. This should be still acceptable within the __GFP_NORETRY semantics. - Before the patch, the second allocation attempt (on all nodes) was passing __GFP_NORETRY. This is redundant as the check for pageblock order (discussed above) was stronger. It's also contrary to madvise(MADV_HUGEPAGE) which means some effort to allocate THP is requested. After this patch, the second attempt doesn't pass __GFP_THISNODE nor __GFP_NORETRY. To sum up, THP page faults now try the following attempts: 1. local node only THP allocation with no reclaim, just compaction. 2. for madvised VMA's or when synchronous compaction is enabled always - THP allocation from any node with effort determined by global defrag setting and VMA madvise 3. fallback to base pages on any node Link: http://lkml.kernel.org/r/08a3f4dd-c3ce-0009-86c5-9ee51aba8557@suse.cz Fixes: b39d0ee2632d ("mm, page_alloc: avoid expensive reclaim when compaction may not succeed") Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: "Kirill A. Shutemov" <kirill@shutemov.name> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-01-14 00:29:04 +00:00
* memory with both reclaim and compact as well.
mm, page_alloc: allow hugepage fallback to remote nodes when madvised For systems configured to always try hard to allocate transparent hugepages (thp defrag setting of "always") or for memory that has been explicitly madvised to MADV_HUGEPAGE, it is often better to fallback to remote memory to allocate the hugepage if the local allocation fails first. The point is to allow the initial call to __alloc_pages_node() to attempt to defragment local memory to make a hugepage available, if possible, rather than immediately fallback to remote memory. Local hugepages will always have a better access latency than remote (huge)pages, so an attempt to make a hugepage available locally is always preferred. If memory compaction cannot be successful locally, however, it is likely better to fallback to remote memory. This could take on two forms: either allow immediate fallback to remote memory or do per-zone watermark checks. It would be possible to fallback only when per-zone watermarks fail for order-0 memory, since that would require local reclaim for all subsequent faults so remote huge allocation is likely better than thrashing the local zone for large workloads. In this case, it is assumed that because the system is configured to try hard to allocate hugepages or the vma is advised to explicitly want to try hard for hugepages that remote allocation is better when local allocation and memory compaction have both failed. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Stefan Priebe - Profihost AG <s.priebe@profihost.ag> Cc: "Kirill A. Shutemov" <kirill@shutemov.name> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-04 19:54:25 +00:00
*/
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
}
}
mm, page_alloc: allow hugepage fallback to remote nodes when madvised For systems configured to always try hard to allocate transparent hugepages (thp defrag setting of "always") or for memory that has been explicitly madvised to MADV_HUGEPAGE, it is often better to fallback to remote memory to allocate the hugepage if the local allocation fails first. The point is to allow the initial call to __alloc_pages_node() to attempt to defragment local memory to make a hugepage available, if possible, rather than immediately fallback to remote memory. Local hugepages will always have a better access latency than remote (huge)pages, so an attempt to make a hugepage available locally is always preferred. If memory compaction cannot be successful locally, however, it is likely better to fallback to remote memory. This could take on two forms: either allow immediate fallback to remote memory or do per-zone watermark checks. It would be possible to fallback only when per-zone watermarks fail for order-0 memory, since that would require local reclaim for all subsequent faults so remote huge allocation is likely better than thrashing the local zone for large workloads. In this case, it is assumed that because the system is configured to try hard to allocate hugepages or the vma is advised to explicitly want to try hard for hugepages that remote allocation is better when local allocation and memory compaction have both failed. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Stefan Priebe - Profihost AG <s.priebe@profihost.ag> Cc: "Kirill A. Shutemov" <kirill@shutemov.name> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-09-04 19:54:25 +00:00
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
page = __alloc_pages(gfp, order, nid, nodemask);
if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
if (static_branch_likely(&vm_numa_stat_key) &&
page_to_nid(page) == nid) {
preempt_disable();
__count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
preempt_enable();
}
Revert "mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask" This reverts commit 89c83fb539f95491be80cdd5158e6f0ce329e317. This should have been done as part of 2f0799a0ffc0 ("mm, thp: restore node-local hugepage allocations"). The movement of the thp allocation policy from alloc_pages_vma() to alloc_hugepage_direct_gfpmask() was intended to only set __GFP_THISNODE for mempolicies that are not MPOL_BIND whereas the revert could set this regardless of mempolicy. While the check for MPOL_BIND between alloc_hugepage_direct_gfpmask() and alloc_pages_vma() was racy, that has since been removed since the revert. What is left is the possibility to use __GFP_THISNODE in policy_node() when it is unexpected because the special handling for hugepages in alloc_pages_vma() was removed as part of the consolidation. Secondly, prior to 89c83fb539f9, alloc_pages_vma() implemented a somewhat different policy for hugepage allocations, which were allocated through alloc_hugepage_vma(). For hugepage allocations, if the allocating process's node is in the set of allowed nodes, allocate with __GFP_THISNODE for that node (for MPOL_PREFERRED, use that node with __GFP_THISNODE instead). This was changed for shmem_alloc_hugepage() to allow fallback to other nodes in 89c83fb539f9 as it did for new_page() in mm/mempolicy.c which is functionally different behavior and removes the requirement to only allocate hugepages locally. So this commit does a full revert of 89c83fb539f9 instead of the partial revert that was done in 2f0799a0ffc0. The result is the same thp allocation policy for 4.20 that was in 4.19. Fixes: 89c83fb539f9 ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask") Fixes: 2f0799a0ffc0 ("mm, thp: restore node-local hugepage allocations") Signed-off-by: David Rientjes <rientjes@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-07 22:50:16 +00:00
}
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
return page;
}
/**
* vma_alloc_folio - Allocate a folio for a VMA.
* @gfp: GFP flags.
* @order: Order of the folio.
* @vma: Pointer to VMA.
* @addr: Virtual address of the allocation. Must be inside @vma.
* @hugepage: Unused (was: For hugepages try only preferred node if possible).
*
* Allocate a folio for a specific address in @vma, using the appropriate
* NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
* VMA to prevent it from going away. Should be used for all allocations
* for folios that will be mapped into user space, excepting hugetlbfs, and
* excepting where direct use of alloc_pages_mpol() is more appropriate.
*
* Return: The folio on success or NULL if allocation fails.
*/
struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, bool hugepage)
{
struct mempolicy *pol;
pgoff_t ilx;
struct page *page;
pol = get_vma_policy(vma, addr, order, &ilx);
page = alloc_pages_mpol(gfp | __GFP_COMP, order,
pol, ilx, numa_node_id());
mpol_cond_put(pol);
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
return page_rmappable_folio(page);
}
EXPORT_SYMBOL(vma_alloc_folio);
/**
* alloc_pages - Allocate pages.
* @gfp: GFP flags.
* @order: Power of two of number of pages to allocate.
*
* Allocate 1 << @order contiguous pages. The physical address of the
* first page is naturally aligned (eg an order-3 allocation will be aligned
* to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
* process is honoured when in process context.
*
* Context: Can be called from any context, providing the appropriate GFP
* flags are used.
* Return: The page on success or NULL if allocation fails.
*/
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
struct page *alloc_pages(gfp_t gfp, unsigned int order)
{
struct mempolicy *pol = &default_policy;
mempolicy: rework mempolicy Reference Counting [yet again] After further discussion with Christoph Lameter, it has become clear that my earlier attempts to clean up the mempolicy reference counting were a bit of overkill in some areas, resulting in superflous ref/unref in what are usually fast paths. In other areas, further inspection reveals that I botched the unref for interleave policies. A separate patch, suitable for upstream/stable trees, fixes up the known errors in the previous attempt to fix reference counting. This patch reworks the memory policy referencing counting and, one hopes, simplifies the code. Maybe I'll get it right this time. See the update to the numa_memory_policy.txt document for a discussion of memory policy reference counting that motivates this patch. Summary: Lookup of mempolicy, based on (vma, address) need only add a reference for shared policy, and we need only unref the policy when finished for shared policies. So, this patch backs out all of the unneeded extra reference counting added by my previous attempt. It then unrefs only shared policies when we're finished with them, using the mpol_cond_put() [conditional put] helper function introduced by this patch. Note that shmem_swapin() calls read_swap_cache_async() with a dummy vma containing just the policy. read_swap_cache_async() can call alloc_page_vma() multiple times, so we can't let alloc_page_vma() unref the shared policy in this case. To avoid this, we make a copy of any non-null shared policy and remove the MPOL_F_SHARED flag from the copy. This copy occurs before reading a page [or multiple pages] from swap, so the overhead should not be an issue here. I introduced a new static inline function "mpol_cond_copy()" to copy the shared policy to an on-stack policy and remove the flags that would require a conditional free. The current implementation of mpol_cond_copy() assumes that the struct mempolicy contains no pointers to dynamically allocated structures that must be duplicated or reference counted during copy. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:16 +00:00
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
cpuset: mm: reduce large amounts of memory barrier related damage v3 Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") wins a super prize for the largest number of memory barriers entered into fast paths for one commit. [get|put]_mems_allowed is incredibly heavy with pairs of full memory barriers inserted into a number of hot paths. This was detected while investigating at large page allocator slowdown introduced some time after 2.6.32. The largest portion of this overhead was shown by oprofile to be at an mfence introduced by this commit into the page allocator hot path. For extra style points, the commit introduced the use of yield() in an implementation of what looks like a spinning mutex. This patch replaces the full memory barriers on both read and write sides with a sequence counter with just read barriers on the fast path side. This is much cheaper on some architectures, including x86. The main bulk of the patch is the retry logic if the nodemask changes in a manner that can cause a false failure. While updating the nodemask, a check is made to see if a false failure is a risk. If it is, the sequence number gets bumped and parallel allocators will briefly stall while the nodemask update takes place. In a page fault test microbenchmark, oprofile samples from __alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The actual results were 3.3.0-rc3 3.3.0-rc3 rc3-vanilla nobarrier-v2r1 Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%) Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%) Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%) Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%) Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%) Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%) Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%) Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%) Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%) Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%) Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%) Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%) Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%) Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%) Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%) MMTests Statistics: duration Sys Time Running Test (seconds) 135.68 132.17 User+Sys Time Running Test (seconds) 164.2 160.13 Total Elapsed Time (seconds) 123.46 120.87 The overall improvement is small but the System CPU time is much improved and roughly in correlation to what oprofile reported (these performance figures are without profiling so skew is expected). The actual number of page faults is noticeably improved. For benchmarks like kernel builds, the overall benefit is marginal but the system CPU time is slightly reduced. To test the actual bug the commit fixed I opened two terminals. The first ran within a cpuset and continually ran a small program that faulted 100M of anonymous data. In a second window, the nodemask of the cpuset was continually randomised in a loop. Without the commit, the program would fail every so often (usually within 10 seconds) and obviously with the commit everything worked fine. With this patch applied, it also worked fine so the fix should be functionally equivalent. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 23:34:11 +00:00
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
return alloc_pages_mpol(gfp, order,
pol, NO_INTERLEAVE_INDEX, numa_node_id());
}
EXPORT_SYMBOL(alloc_pages);
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
struct folio *folio_alloc(gfp_t gfp, unsigned int order)
{
return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
}
EXPORT_SYMBOL(folio_alloc);
mm/vmalloc: introduce alloc_pages_bulk_array_mempolicy to accelerate memory allocation Commit ffb29b1c255a ("mm/vmalloc: fix numa spreading for large hash tables") can cause significant performance regressions in some situations as Andrew mentioned in [1]. The main situation is vmalloc, vmalloc will allocate pages with NUMA_NO_NODE by default, that will result in alloc page one by one; In order to solve this, __alloc_pages_bulk and mempolicy should be considered at the same time. 1) If node is specified in memory allocation request, it will alloc all pages by __alloc_pages_bulk. 2) If interleaving allocate memory, it will cauculate how many pages should be allocated in each node, and use __alloc_pages_bulk to alloc pages in each node. [1]: https://lore.kernel.org/lkml/CALvZod4G3SzP3kWxQYn0fj+VgG-G3yWXz=gz17+3N57ru1iajw@mail.gmail.com/t/#m750c8e3231206134293b089feaa090590afa0f60 [akpm@linux-foundation.org: coding style fixes] [akpm@linux-foundation.org: make two functions static] [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Link: https://lkml.kernel.org/r/20211021080744.874701-3-chenwandun@huawei.com Signed-off-by: Chen Wandun <chenwandun@huawei.com> Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Shakeel Butt <shakeelb@google.com> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Hanjun Guo <guohanjun@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-05 20:39:53 +00:00
static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
int nodes;
unsigned long nr_pages_per_node;
int delta;
int i;
unsigned long nr_allocated;
unsigned long total_allocated = 0;
nodes = nodes_weight(pol->nodes);
nr_pages_per_node = nr_pages / nodes;
delta = nr_pages - nodes * nr_pages_per_node;
for (i = 0; i < nodes; i++) {
if (delta) {
nr_allocated = __alloc_pages_bulk(gfp,
interleave_nodes(pol), NULL,
nr_pages_per_node + 1, NULL,
page_array);
delta--;
} else {
nr_allocated = __alloc_pages_bulk(gfp,
interleave_nodes(pol), NULL,
nr_pages_per_node, NULL, page_array);
}
page_array += nr_allocated;
total_allocated += nr_allocated;
}
return total_allocated;
}
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
struct task_struct *me = current;
unsigned int cpuset_mems_cookie;
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
unsigned long total_allocated = 0;
unsigned long nr_allocated = 0;
unsigned long rounds;
unsigned long node_pages, delta;
u8 *table, *weights, weight;
unsigned int weight_total = 0;
unsigned long rem_pages = nr_pages;
nodemask_t nodes;
int nnodes, node;
int resume_node = MAX_NUMNODES - 1;
u8 resume_weight = 0;
int prev_node;
int i;
if (!nr_pages)
return 0;
/* read the nodes onto the stack, retry if done during rebind */
do {
cpuset_mems_cookie = read_mems_allowed_begin();
nnodes = read_once_policy_nodemask(pol, &nodes);
} while (read_mems_allowed_retry(cpuset_mems_cookie));
/* if the nodemask has become invalid, we cannot do anything */
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
if (!nnodes)
return 0;
/* Continue allocating from most recent node and adjust the nr_pages */
node = me->il_prev;
weight = me->il_weight;
if (weight && node_isset(node, nodes)) {
node_pages = min(rem_pages, weight);
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
NULL, page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
/* if that's all the pages, no need to interleave */
if (rem_pages <= weight) {
me->il_weight -= rem_pages;
return total_allocated;
}
/* Otherwise we adjust remaining pages, continue from there */
rem_pages -= weight;
}
/* clear active weight in case of an allocation failure */
me->il_weight = 0;
prev_node = node;
/* create a local copy of node weights to operate on outside rcu */
weights = kzalloc(nr_node_ids, GFP_KERNEL);
if (!weights)
return total_allocated;
rcu_read_lock();
table = rcu_dereference(iw_table);
if (table)
memcpy(weights, table, nr_node_ids);
rcu_read_unlock();
/* calculate total, detect system default usage */
for_each_node_mask(node, nodes) {
if (!weights[node])
weights[node] = 1;
weight_total += weights[node];
}
/*
* Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
* Track which node weighted interleave should resume from.
*
* if (rounds > 0) and (delta == 0), resume_node will always be
* the node following prev_node and its weight.
*/
rounds = rem_pages / weight_total;
delta = rem_pages % weight_total;
resume_node = next_node_in(prev_node, nodes);
resume_weight = weights[resume_node];
for (i = 0; i < nnodes; i++) {
node = next_node_in(prev_node, nodes);
weight = weights[node];
node_pages = weight * rounds;
/* If a delta exists, add this node's portion of the delta */
if (delta > weight) {
node_pages += weight;
delta -= weight;
} else if (delta) {
/* when delta is depleted, resume from that node */
node_pages += delta;
resume_node = node;
resume_weight = weight - delta;
delta = 0;
}
/* node_pages can be 0 if an allocation fails and rounds == 0 */
if (!node_pages)
break;
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
NULL, page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
if (total_allocated == nr_pages)
break;
prev_node = node;
}
me->il_prev = resume_node;
me->il_weight = resume_weight;
kfree(weights);
return total_allocated;
}
mm/vmalloc: introduce alloc_pages_bulk_array_mempolicy to accelerate memory allocation Commit ffb29b1c255a ("mm/vmalloc: fix numa spreading for large hash tables") can cause significant performance regressions in some situations as Andrew mentioned in [1]. The main situation is vmalloc, vmalloc will allocate pages with NUMA_NO_NODE by default, that will result in alloc page one by one; In order to solve this, __alloc_pages_bulk and mempolicy should be considered at the same time. 1) If node is specified in memory allocation request, it will alloc all pages by __alloc_pages_bulk. 2) If interleaving allocate memory, it will cauculate how many pages should be allocated in each node, and use __alloc_pages_bulk to alloc pages in each node. [1]: https://lore.kernel.org/lkml/CALvZod4G3SzP3kWxQYn0fj+VgG-G3yWXz=gz17+3N57ru1iajw@mail.gmail.com/t/#m750c8e3231206134293b089feaa090590afa0f60 [akpm@linux-foundation.org: coding style fixes] [akpm@linux-foundation.org: make two functions static] [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Link: https://lkml.kernel.org/r/20211021080744.874701-3-chenwandun@huawei.com Signed-off-by: Chen Wandun <chenwandun@huawei.com> Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Shakeel Butt <shakeelb@google.com> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Hanjun Guo <guohanjun@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-05 20:39:53 +00:00
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
gfp_t preferred_gfp;
unsigned long nr_allocated = 0;
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
nr_pages, NULL, page_array);
if (nr_allocated < nr_pages)
nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
nr_pages - nr_allocated, NULL,
page_array + nr_allocated);
return nr_allocated;
}
/* alloc pages bulk and mempolicy should be considered at the
* same time in some situation such as vmalloc.
*
* It can accelerate memory allocation especially interleaving
* allocate memory.
*/
unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
unsigned long nr_pages, struct page **page_array)
{
struct mempolicy *pol = &default_policy;
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
nodemask_t *nodemask;
int nid;
mm/vmalloc: introduce alloc_pages_bulk_array_mempolicy to accelerate memory allocation Commit ffb29b1c255a ("mm/vmalloc: fix numa spreading for large hash tables") can cause significant performance regressions in some situations as Andrew mentioned in [1]. The main situation is vmalloc, vmalloc will allocate pages with NUMA_NO_NODE by default, that will result in alloc page one by one; In order to solve this, __alloc_pages_bulk and mempolicy should be considered at the same time. 1) If node is specified in memory allocation request, it will alloc all pages by __alloc_pages_bulk. 2) If interleaving allocate memory, it will cauculate how many pages should be allocated in each node, and use __alloc_pages_bulk to alloc pages in each node. [1]: https://lore.kernel.org/lkml/CALvZod4G3SzP3kWxQYn0fj+VgG-G3yWXz=gz17+3N57ru1iajw@mail.gmail.com/t/#m750c8e3231206134293b089feaa090590afa0f60 [akpm@linux-foundation.org: coding style fixes] [akpm@linux-foundation.org: make two functions static] [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Link: https://lkml.kernel.org/r/20211021080744.874701-3-chenwandun@huawei.com Signed-off-by: Chen Wandun <chenwandun@huawei.com> Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Shakeel Butt <shakeelb@google.com> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Hanjun Guo <guohanjun@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-05 20:39:53 +00:00
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
if (pol->mode == MPOL_INTERLEAVE)
return alloc_pages_bulk_array_interleave(gfp, pol,
nr_pages, page_array);
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
return alloc_pages_bulk_array_weighted_interleave(
gfp, pol, nr_pages, page_array);
mm/vmalloc: introduce alloc_pages_bulk_array_mempolicy to accelerate memory allocation Commit ffb29b1c255a ("mm/vmalloc: fix numa spreading for large hash tables") can cause significant performance regressions in some situations as Andrew mentioned in [1]. The main situation is vmalloc, vmalloc will allocate pages with NUMA_NO_NODE by default, that will result in alloc page one by one; In order to solve this, __alloc_pages_bulk and mempolicy should be considered at the same time. 1) If node is specified in memory allocation request, it will alloc all pages by __alloc_pages_bulk. 2) If interleaving allocate memory, it will cauculate how many pages should be allocated in each node, and use __alloc_pages_bulk to alloc pages in each node. [1]: https://lore.kernel.org/lkml/CALvZod4G3SzP3kWxQYn0fj+VgG-G3yWXz=gz17+3N57ru1iajw@mail.gmail.com/t/#m750c8e3231206134293b089feaa090590afa0f60 [akpm@linux-foundation.org: coding style fixes] [akpm@linux-foundation.org: make two functions static] [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Link: https://lkml.kernel.org/r/20211021080744.874701-3-chenwandun@huawei.com Signed-off-by: Chen Wandun <chenwandun@huawei.com> Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Shakeel Butt <shakeelb@google.com> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Hanjun Guo <guohanjun@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-05 20:39:53 +00:00
if (pol->mode == MPOL_PREFERRED_MANY)
return alloc_pages_bulk_array_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
nid = numa_node_id();
nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
return __alloc_pages_bulk(gfp, nid, nodemask,
nr_pages, NULL, page_array);
mm/vmalloc: introduce alloc_pages_bulk_array_mempolicy to accelerate memory allocation Commit ffb29b1c255a ("mm/vmalloc: fix numa spreading for large hash tables") can cause significant performance regressions in some situations as Andrew mentioned in [1]. The main situation is vmalloc, vmalloc will allocate pages with NUMA_NO_NODE by default, that will result in alloc page one by one; In order to solve this, __alloc_pages_bulk and mempolicy should be considered at the same time. 1) If node is specified in memory allocation request, it will alloc all pages by __alloc_pages_bulk. 2) If interleaving allocate memory, it will cauculate how many pages should be allocated in each node, and use __alloc_pages_bulk to alloc pages in each node. [1]: https://lore.kernel.org/lkml/CALvZod4G3SzP3kWxQYn0fj+VgG-G3yWXz=gz17+3N57ru1iajw@mail.gmail.com/t/#m750c8e3231206134293b089feaa090590afa0f60 [akpm@linux-foundation.org: coding style fixes] [akpm@linux-foundation.org: make two functions static] [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Link: https://lkml.kernel.org/r/20211021080744.874701-3-chenwandun@huawei.com Signed-off-by: Chen Wandun <chenwandun@huawei.com> Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Shakeel Butt <shakeelb@google.com> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Hanjun Guo <guohanjun@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-11-05 20:39:53 +00:00
}
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
2023-10-03 09:20:14 +00:00
struct mempolicy *pol = mpol_dup(src->vm_policy);
if (IS_ERR(pol))
return PTR_ERR(pol);
dst->vm_policy = pol;
return 0;
}
[PATCH] cpuset: rebind vma mempolicies fix Fix more of longstanding bug in cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a tasks cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the tasks mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a tasks address space.) Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The tasks mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpusets mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a tasks mempolicies during fork catches such forking tasks, and ensures their children are also rebound. When a task is moved to a different cpuset, it is easier, as there is only one task involved. It's mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:01:59 +00:00
/*
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
[PATCH] cpuset: rebind vma mempolicies fix Fix more of longstanding bug in cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a tasks cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the tasks mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a tasks address space.) Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The tasks mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpusets mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a tasks mempolicies during fork catches such forking tasks, and ensures their children are also rebound. When a task is moved to a different cpuset, it is easier, as there is only one task involved. It's mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:01:59 +00:00
* rebinds the mempolicy its copying by calling mpol_rebind_policy()
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
mempolicy: restructure rebinding-mempolicy functions Nick Piggin reported that the allocator may see an empty nodemask when changing cpuset's mems[1]. It happens only on the kernel that do not do atomic nodemask_t stores. (MAX_NUMNODES > BITS_PER_LONG) But I found that there is also a problem on the kernel that can do atomic nodemask_t stores. The problem is that the allocator can't find a node to alloc page when changing cpuset's mems though there is a lot of free memory. The reason is like this: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom I can use the attached program reproduce it by the following step: # mkdir /dev/cpuset # mount -t cpuset cpuset /dev/cpuset # mkdir /dev/cpuset/1 # echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus # echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems # echo $$ > /dev/cpuset/1/tasks # numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog <nr_tasks> & <nr_tasks> = max(nr_cpus - 1, 1) # killall -s SIGUSR1 cpuset_mem_hog # ./change_mems.sh several hours later, oom will happen though there is a lot of free memory. This patchset fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. This patch: In order to fix no node to alloc memory, when we want to update mempolicy and mems_allowed, we expand the set of nodes first (set all the newly nodes) and shrink the set of nodes lazily(clean disallowed nodes), But the mempolicy's rebind functions may breaks the expanding. So we restructure the mempolicy's rebind functions and split the rebind work to two steps, just like the update of cpuset's mems: The 1st step: expand the set of the mempolicy's nodes. The 2nd step: shrink the set of the mempolicy's nodes. It is used when there is no real lock to protect the mempolicy in the read-side. Otherwise we can do rebind work at once. In order to implement it, we define enum mpol_rebind_step { MPOL_REBIND_ONCE, MPOL_REBIND_STEP1, MPOL_REBIND_STEP2, MPOL_REBIND_NSTEP, }; If the mempolicy needn't be updated by two steps, we can pass MPOL_REBIND_ONCE to the rebind functions. Or we can pass MPOL_REBIND_STEP1 to do the first step of the rebind work and pass MPOL_REBIND_STEP2 to do the second step work. Besides that, it maybe long time between these two step and we have to release the lock that protects mempolicy and mems_allowed. If we hold the lock once again, we must check whether the current mempolicy is under the rebinding (the first step has been done) or not, because the task may alloc a new mempolicy when we don't hold the lock. So we defined the following flag to identify it: #define MPOL_F_REBINDING (1 << 2) The new functions will be used in the next patch. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Paul Menage <menage@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Ravikiran Thirumalai <kiran@scalex86.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andi Kleen <andi@firstfloor.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-24 21:32:07 +00:00
*
* current's mempolicy may be rebinded by the other task(the task that changes
* cpuset's mems), so we needn't do rebind work for current task.
[PATCH] cpuset: rebind vma mempolicies fix Fix more of longstanding bug in cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a tasks cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the tasks mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a tasks address space.) Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The tasks mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpusets mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a tasks mempolicies during fork catches such forking tasks, and ensures their children are also rebound. When a task is moved to a different cpuset, it is easier, as there is only one task involved. It's mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:01:59 +00:00
*/
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
mempolicy: restructure rebinding-mempolicy functions Nick Piggin reported that the allocator may see an empty nodemask when changing cpuset's mems[1]. It happens only on the kernel that do not do atomic nodemask_t stores. (MAX_NUMNODES > BITS_PER_LONG) But I found that there is also a problem on the kernel that can do atomic nodemask_t stores. The problem is that the allocator can't find a node to alloc page when changing cpuset's mems though there is a lot of free memory. The reason is like this: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom I can use the attached program reproduce it by the following step: # mkdir /dev/cpuset # mount -t cpuset cpuset /dev/cpuset # mkdir /dev/cpuset/1 # echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus # echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems # echo $$ > /dev/cpuset/1/tasks # numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog <nr_tasks> & <nr_tasks> = max(nr_cpus - 1, 1) # killall -s SIGUSR1 cpuset_mem_hog # ./change_mems.sh several hours later, oom will happen though there is a lot of free memory. This patchset fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. This patch: In order to fix no node to alloc memory, when we want to update mempolicy and mems_allowed, we expand the set of nodes first (set all the newly nodes) and shrink the set of nodes lazily(clean disallowed nodes), But the mempolicy's rebind functions may breaks the expanding. So we restructure the mempolicy's rebind functions and split the rebind work to two steps, just like the update of cpuset's mems: The 1st step: expand the set of the mempolicy's nodes. The 2nd step: shrink the set of the mempolicy's nodes. It is used when there is no real lock to protect the mempolicy in the read-side. Otherwise we can do rebind work at once. In order to implement it, we define enum mpol_rebind_step { MPOL_REBIND_ONCE, MPOL_REBIND_STEP1, MPOL_REBIND_STEP2, MPOL_REBIND_NSTEP, }; If the mempolicy needn't be updated by two steps, we can pass MPOL_REBIND_ONCE to the rebind functions. Or we can pass MPOL_REBIND_STEP1 to do the first step of the rebind work and pass MPOL_REBIND_STEP2 to do the second step work. Besides that, it maybe long time between these two step and we have to release the lock that protects mempolicy and mems_allowed. If we hold the lock once again, we must check whether the current mempolicy is under the rebinding (the first step has been done) or not, because the task may alloc a new mempolicy when we don't hold the lock. So we defined the following flag to identify it: #define MPOL_F_REBINDING (1 << 2) The new functions will be used in the next patch. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Paul Menage <menage@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Ravikiran Thirumalai <kiran@scalex86.org> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andi Kleen <andi@firstfloor.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-24 21:32:07 +00:00
/* task's mempolicy is protected by alloc_lock */
if (old == current->mempolicy) {
task_lock(current);
*new = *old;
task_unlock(current);
} else
*new = *old;
[PATCH] cpuset: rebind vma mempolicies fix Fix more of longstanding bug in cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a tasks cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the tasks mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a tasks address space.) Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The tasks mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpusets mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a tasks mempolicies during fork catches such forking tasks, and ensures their children are also rebound. When a task is moved to a different cpuset, it is easier, as there is only one task involved. It's mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:01:59 +00:00
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Michal Hocko <mhocko@suse.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Rientjes <rientjes@google.com> Cc: Dimitri Sivanich <sivanich@sgi.com> Cc: Hugh Dickins <hughd@google.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-07-06 22:40:06 +00:00
mpol_rebind_policy(new, &mems);
[PATCH] cpuset: rebind vma mempolicies fix Fix more of longstanding bug in cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current tasks cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a tasks cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the tasks mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a tasks address space.) Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The tasks mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpusets mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a tasks mempolicies during fork catches such forking tasks, and ensures their children are also rebound. When a task is moved to a different cpuset, it is easier, as there is only one task involved. It's mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 09:01:59 +00:00
}
atomic_set(&new->refcnt, 1);
return new;
}
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
if (!a || !b)
return false;
if (a->mode != b->mode)
return false;
if (a->flags != b->flags)
return false;
mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com> Cc: <linux-api@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-01-14 22:08:17 +00:00
if (a->home_node != b->home_node)
return false;
if (mpol_store_user_nodemask(a))
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
return false;
switch (a->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
case MPOL_PREFERRED:
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
case MPOL_PREFERRED_MANY:
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
case MPOL_WEIGHTED_INTERLEAVE:
return !!nodes_equal(a->nodes, b->nodes);
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
case MPOL_LOCAL:
return true;
default:
BUG();
return false;
}
}
/*
* Shared memory backing store policy support.
*
* Remember policies even when nobody has shared memory mapped.
* The policies are kept in Red-Black tree linked from the inode.
mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good reducing this consumtion at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer <nzimmer@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Nadia Yvette Chambers <nyc@holomorphy.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Mel Gorman <mgorman@suse.de> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:18:36 +00:00
* They are protected by the sp->lock rwlock, which should be held
* for any accesses to the tree.
*/
mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good reducing this consumtion at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer <nzimmer@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Nadia Yvette Chambers <nyc@holomorphy.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Mel Gorman <mgorman@suse.de> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:18:36 +00:00
/*
* lookup first element intersecting start-end. Caller holds sp->lock for
* reading or for writing
*/
static struct sp_node *sp_lookup(struct shared_policy *sp,
pgoff_t start, pgoff_t end)
{
struct rb_node *n = sp->root.rb_node;
while (n) {
struct sp_node *p = rb_entry(n, struct sp_node, nd);
if (start >= p->end)
n = n->rb_right;
else if (end <= p->start)
n = n->rb_left;
else
break;
}
if (!n)
return NULL;
for (;;) {
struct sp_node *w = NULL;
struct rb_node *prev = rb_prev(n);
if (!prev)
break;
w = rb_entry(prev, struct sp_node, nd);
if (w->end <= start)
break;
n = prev;
}
return rb_entry(n, struct sp_node, nd);
}
mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good reducing this consumtion at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer <nzimmer@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Nadia Yvette Chambers <nyc@holomorphy.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Mel Gorman <mgorman@suse.de> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:18:36 +00:00
/*
* Insert a new shared policy into the list. Caller holds sp->lock for
* writing.
*/
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
struct rb_node **p = &sp->root.rb_node;
struct rb_node *parent = NULL;
struct sp_node *nd;
while (*p) {
parent = *p;
nd = rb_entry(parent, struct sp_node, nd);
if (new->start < nd->start)
p = &(*p)->rb_left;
else if (new->end > nd->end)
p = &(*p)->rb_right;
else
BUG();
}
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
}
/* Find shared policy intersecting idx */
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
pgoff_t idx)
{
struct mempolicy *pol = NULL;
struct sp_node *sn;
if (!sp->root.rb_node)
return NULL;
mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good reducing this consumtion at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer <nzimmer@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Nadia Yvette Chambers <nyc@holomorphy.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Mel Gorman <mgorman@suse.de> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:18:36 +00:00
read_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good reducing this consumtion at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer <nzimmer@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Nadia Yvette Chambers <nyc@holomorphy.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Mel Gorman <mgorman@suse.de> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:18:36 +00:00
read_unlock(&sp->lock);
return pol;
}
static void sp_free(struct sp_node *n)
{
mpol_put(n->policy);
kmem_cache_free(sn_cache, n);
}
/**
* mpol_misplaced - check whether current folio node is valid in policy
*
* @folio: folio to be checked
* @vma: vm area where folio mapped
* @addr: virtual address in @vma for shared policy lookup and interleave policy
*
* Lookup current policy node id for vma,addr and "compare to" folio's
* node id. Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*
* Return: NUMA_NO_NODE if the page is in a node that is valid for this
* policy, or a suitable node ID to allocate a replacement folio from.
*/
int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr)
{
struct mempolicy *pol;
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
pgoff_t ilx;
struct zoneref *z;
int curnid = folio_nid(folio);
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
mm: replace all open encodings for NUMA_NO_NODE Patch series "Replace all open encodings for NUMA_NO_NODE", v3. All these places for replacement were found by running the following grep patterns on the entire kernel code. Please let me know if this might have missed some instances. This might also have replaced some false positives. I will appreciate suggestions, inputs and review. 1. git grep "nid == -1" 2. git grep "node == -1" 3. git grep "nid = -1" 4. git grep "node = -1" This patch (of 2): At present there are multiple places where invalid node number is encoded as -1. Even though implicitly understood it is always better to have macros in there. Replace these open encodings for an invalid node number with the global macro NUMA_NO_NODE. This helps remove NUMA related assumptions like 'invalid node' from various places redirecting them to a common definition. Link: http://lkml.kernel.org/r/1545127933-10711-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com> Reviewed-by: David Hildenbrand <david@redhat.com> Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com> [ixgbe] Acked-by: Jens Axboe <axboe@kernel.dk> [mtip32xx] Acked-by: Vinod Koul <vkoul@kernel.org> [dmaengine.c] Acked-by: Michael Ellerman <mpe@ellerman.id.au> [powerpc] Acked-by: Doug Ledford <dledford@redhat.com> [drivers/infiniband] Cc: Joseph Qi <jiangqi903@gmail.com> Cc: Hans Verkuil <hverkuil@xs4all.nl> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-03-05 23:42:58 +00:00
int polnid = NUMA_NO_NODE;
int ret = NUMA_NO_NODE;
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
if (!(pol->flags & MPOL_F_MOF))
goto out;
switch (pol->mode) {
case MPOL_INTERLEAVE:
mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Christoph Lameter <cl@linux.com> Cc: David Hildenbrand <david@redhat.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Huang Ying <ying.huang@intel.com> Cc: Kefeng Wang <wangkefeng.wang@huawei.com> Cc: Matthew Wilcox (Oracle) <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Tejun heo <tj@kernel.org> Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com> Cc: Yang Shi <shy828301@gmail.com> Cc: Yosry Ahmed <yosryahmed@google.com> Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-10-19 20:39:08 +00:00
polnid = interleave_nid(pol, ilx);
break;
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
case MPOL_WEIGHTED_INTERLEAVE:
polnid = weighted_interleave_nid(pol, ilx);
break;
case MPOL_PREFERRED:
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
if (node_isset(curnid, pol->nodes))
goto out;
polnid = first_node(pol->nodes);
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
break;
case MPOL_LOCAL:
polnid = numa_node_id();
break;
case MPOL_BIND:
numa balancing: migrate on fault among multiple bound nodes Now, NUMA balancing can only optimize the page placement among the NUMA nodes if the default memory policy is used. Because the memory policy specified explicitly should take precedence. But this seems too strict in some situations. For example, on a system with 4 NUMA nodes, if the memory of an application is bound to the node 0 and 1, NUMA balancing can potentially migrate the pages between the node 0 and 1 to reduce cross-node accessing without breaking the explicit memory binding policy. So in this patch, we add MPOL_F_NUMA_BALANCING mode flag to set_mempolicy() when mode is MPOL_BIND. With the flag specified, NUMA balancing will be enabled within the thread to optimize the page placement within the constrains of the specified memory binding policy. With the newly added flag, the NUMA balancing control mechanism becomes, - sysctl knob numa_balancing can enable/disable the NUMA balancing globally. - even if sysctl numa_balancing is enabled, the NUMA balancing will be disabled for the memory areas or applications with the explicit memory policy by default. - MPOL_F_NUMA_BALANCING can be used to enable the NUMA balancing for the applications when specifying the explicit memory policy (MPOL_BIND). Various page placement optimization based on the NUMA balancing can be done with these flags. As the first step, in this patch, if the memory of the application is bound to multiple nodes (MPOL_BIND), and in the hint page fault handler the accessing node are in the policy nodemask, the page will be tried to be migrated to the accessing node to reduce the cross-node accessing. If the newly added MPOL_F_NUMA_BALANCING flag is specified by an application on an old kernel version without its support, set_mempolicy() will return -1 and errno will be set to EINVAL. The application can use this behavior to run on both old and new kernel versions. And if the MPOL_F_NUMA_BALANCING flag is specified for the mode other than MPOL_BIND, set_mempolicy() will return -1 and errno will be set to EINVAL as before. Because we don't support optimization based on the NUMA balancing for these modes. In the previous version of the patch, we tried to reuse MPOL_MF_LAZY for mbind(). But that flag is tied to MPOL_MF_MOVE.*, so it seems not a good API/ABI for the purpose of the patch. And because it's not clear whether it's necessary to enable NUMA balancing for a specific memory area inside an application, so we only add the flag at the thread level (set_mempolicy()) instead of the memory area level (mbind()). We can do that when it become necessary. To test the patch, we run a test case as follows on a 4-node machine with 192 GB memory (48 GB per node). 1. Change pmbench memory accessing benchmark to call set_mempolicy() to bind its memory to node 1 and 3 and enable NUMA balancing. Some related code snippets are as follows, #include <numaif.h> #include <numa.h> struct bitmask *bmp; int ret; bmp = numa_parse_nodestring("1,3"); ret = set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING, bmp->maskp, bmp->size + 1); /* If MPOL_F_NUMA_BALANCING isn't supported, fall back to MPOL_BIND */ if (ret < 0 && errno == EINVAL) ret = set_mempolicy(MPOL_BIND, bmp->maskp, bmp->size + 1); if (ret < 0) { perror("Failed to call set_mempolicy"); exit(-1); } 2. Run a memory eater on node 3 to use 40 GB memory before running pmbench. 3. Run pmbench with 64 processes, the working-set size of each process is 640 MB, so the total working-set size is 64 * 640 MB = 40 GB. The CPU and the memory (as in step 1.) of all pmbench processes is bound to node 1 and 3. So, after CPU usage is balanced, some pmbench processes run on the CPUs of the node 3 will access the memory of the node 1. 4. After the pmbench processes run for 100 seconds, kill the memory eater. Now it's possible for some pmbench processes to migrate their pages from node 1 to node 3 to reduce cross-node accessing. Test results show that, with the patch, the pages can be migrated from node 1 to node 3 after killing the memory eater, and the pmbench score can increase about 17.5%. Link: https://lkml.kernel.org/r/20210120061235.148637-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" <ying.huang@intel.com> Acked-by: Mel Gorman <mgorman@suse.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Rik van Riel <riel@surriel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Michal Hocko <mhocko@suse.com> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-02-24 20:09:43 +00:00
/* Optimize placement among multiple nodes via NUMA balancing */
if (pol->flags & MPOL_F_MORON) {
if (node_isset(thisnid, pol->nodes))
numa balancing: migrate on fault among multiple bound nodes Now, NUMA balancing can only optimize the page placement among the NUMA nodes if the default memory policy is used. Because the memory policy specified explicitly should take precedence. But this seems too strict in some situations. For example, on a system with 4 NUMA nodes, if the memory of an application is bound to the node 0 and 1, NUMA balancing can potentially migrate the pages between the node 0 and 1 to reduce cross-node accessing without breaking the explicit memory binding policy. So in this patch, we add MPOL_F_NUMA_BALANCING mode flag to set_mempolicy() when mode is MPOL_BIND. With the flag specified, NUMA balancing will be enabled within the thread to optimize the page placement within the constrains of the specified memory binding policy. With the newly added flag, the NUMA balancing control mechanism becomes, - sysctl knob numa_balancing can enable/disable the NUMA balancing globally. - even if sysctl numa_balancing is enabled, the NUMA balancing will be disabled for the memory areas or applications with the explicit memory policy by default. - MPOL_F_NUMA_BALANCING can be used to enable the NUMA balancing for the applications when specifying the explicit memory policy (MPOL_BIND). Various page placement optimization based on the NUMA balancing can be done with these flags. As the first step, in this patch, if the memory of the application is bound to multiple nodes (MPOL_BIND), and in the hint page fault handler the accessing node are in the policy nodemask, the page will be tried to be migrated to the accessing node to reduce the cross-node accessing. If the newly added MPOL_F_NUMA_BALANCING flag is specified by an application on an old kernel version without its support, set_mempolicy() will return -1 and errno will be set to EINVAL. The application can use this behavior to run on both old and new kernel versions. And if the MPOL_F_NUMA_BALANCING flag is specified for the mode other than MPOL_BIND, set_mempolicy() will return -1 and errno will be set to EINVAL as before. Because we don't support optimization based on the NUMA balancing for these modes. In the previous version of the patch, we tried to reuse MPOL_MF_LAZY for mbind(). But that flag is tied to MPOL_MF_MOVE.*, so it seems not a good API/ABI for the purpose of the patch. And because it's not clear whether it's necessary to enable NUMA balancing for a specific memory area inside an application, so we only add the flag at the thread level (set_mempolicy()) instead of the memory area level (mbind()). We can do that when it become necessary. To test the patch, we run a test case as follows on a 4-node machine with 192 GB memory (48 GB per node). 1. Change pmbench memory accessing benchmark to call set_mempolicy() to bind its memory to node 1 and 3 and enable NUMA balancing. Some related code snippets are as follows, #include <numaif.h> #include <numa.h> struct bitmask *bmp; int ret; bmp = numa_parse_nodestring("1,3"); ret = set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING, bmp->maskp, bmp->size + 1); /* If MPOL_F_NUMA_BALANCING isn't supported, fall back to MPOL_BIND */ if (ret < 0 && errno == EINVAL) ret = set_mempolicy(MPOL_BIND, bmp->maskp, bmp->size + 1); if (ret < 0) { perror("Failed to call set_mempolicy"); exit(-1); } 2. Run a memory eater on node 3 to use 40 GB memory before running pmbench. 3. Run pmbench with 64 processes, the working-set size of each process is 640 MB, so the total working-set size is 64 * 640 MB = 40 GB. The CPU and the memory (as in step 1.) of all pmbench processes is bound to node 1 and 3. So, after CPU usage is balanced, some pmbench processes run on the CPUs of the node 3 will access the memory of the node 1. 4. After the pmbench processes run for 100 seconds, kill the memory eater. Now it's possible for some pmbench processes to migrate their pages from node 1 to node 3 to reduce cross-node accessing. Test results show that, with the patch, the pages can be migrated from node 1 to node 3 after killing the memory eater, and the pmbench score can increase about 17.5%. Link: https://lkml.kernel.org/r/20210120061235.148637-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" <ying.huang@intel.com> Acked-by: Mel Gorman <mgorman@suse.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Rik van Riel <riel@surriel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Michal Hocko <mhocko@suse.com> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-02-24 20:09:43 +00:00
break;
goto out;
}
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
fallthrough;
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
case MPOL_PREFERRED_MANY:
/*
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
*/
if (node_isset(curnid, pol->nodes))
goto out;
z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->nodes);
polnid = zone_to_nid(z->zone);
break;
default:
BUG();
}
/* Migrate the folio towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
polnid = thisnid;
if (!should_numa_migrate_memory(current, folio, curnid,
thiscpu))
goto out;
}
if (curnid != polnid)
ret = polnid;
out:
mpol_cond_put(pol);
return ret;
}
/*
* Drop the (possibly final) reference to task->mempolicy. It needs to be
* dropped after task->mempolicy is set to NULL so that any allocation done as
* part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
* policy.
*/
void mpol_put_task_policy(struct task_struct *task)
{
struct mempolicy *pol;
task_lock(task);
pol = task->mempolicy;
task->mempolicy = NULL;
task_unlock(task);
mpol_put(pol);
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
rb_erase(&n->nd, &sp->root);
sp_free(n);
}
mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [<ffffffff810aa314>] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [<ffffffff8122f017>] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-21 23:10:25 +00:00
static void sp_node_init(struct sp_node *node, unsigned long start,
unsigned long end, struct mempolicy *pol)
{
node->start = start;
node->end = end;
node->policy = pol;
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones <davej@redhat.com>, Cc: Christoph Lameter <cl@linux.com>, Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Josh Boyer <jwboyer@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 23:29:16 +00:00
struct sp_node *n;
struct mempolicy *newpol;
mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones <davej@redhat.com>, Cc: Christoph Lameter <cl@linux.com>, Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Josh Boyer <jwboyer@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 23:29:16 +00:00
n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n)
return NULL;
mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones <davej@redhat.com>, Cc: Christoph Lameter <cl@linux.com>, Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Josh Boyer <jwboyer@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 23:29:16 +00:00
newpol = mpol_dup(pol);
if (IS_ERR(newpol)) {
kmem_cache_free(sn_cache, n);
return NULL;
}
newpol->flags |= MPOL_F_SHARED;
mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [<ffffffff810aa314>] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [<ffffffff8122f017>] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-21 23:10:25 +00:00
sp_node_init(n, start, end, newpol);
mempolicy: remove mempolicy sharing Dave Jones' system call fuzz testing tool "trinity" triggered the following bug error with slab debugging enabled ============================================================================= BUG numa_policy (Not tainted): Poison overwritten ----------------------------------------------------------------------------- INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154 __slab_alloc+0x3d3/0x445 kmem_cache_alloc+0x29d/0x2b0 mpol_new+0xa3/0x140 sys_mbind+0x142/0x620 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154 __slab_free+0x2e/0x1de kmem_cache_free+0x25a/0x260 __mpol_put+0x27/0x30 remove_vma+0x68/0x90 exit_mmap+0x118/0x140 mmput+0x73/0x110 exit_mm+0x108/0x130 do_exit+0x162/0xb90 do_group_exit+0x4f/0xc0 sys_exit_group+0x17/0x20 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080 INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0 The problem is that the structure is being prematurely freed due to a reference count imbalance. In the following case mbind(addr, len) should replace the memory policies of both vma1 and vma2 and thus they will become to share the same mempolicy and the new mempolicy will have the MPOL_F_SHARED flag. +-------------------+-------------------+ | vma1 | vma2(shmem) | +-------------------+-------------------+ | | addr addr+len alloc_pages_vma() uses get_vma_policy() and mpol_cond_put() pair for maintaining the mempolicy reference count. The current rule is that get_vma_policy() only increments refcount for shmem VMA and mpol_conf_put() only decrements refcount if the policy has MPOL_F_SHARED. In above case, vma1 is not shmem vma and vma->policy has MPOL_F_SHARED! The reference count will be decreased even though was not increased whenever alloc_page_vma() is called. This has been broken since commit [52cd3b07: mempolicy: rework mempolicy Reference Counting] in 2008. There is another serious bug with the sharing of memory policies. Currently, mempolicy rebind logic (it is called from cpuset rebinding) ignores a refcount of mempolicy and override it forcibly. Thus, any mempolicy sharing may cause mempolicy corruption. The bug was introduced by commit [68860ec1: cpusets: automatic numa mempolicy rebinding]. Ideally, the shared policy handling would be rewritten to either properly handle COW of the policy structures or at least reference count MPOL_F_SHARED based exclusively on information within the policy. However, this patch takes the easier approach of disabling any policy sharing between VMAs. Each new range allocated with sp_alloc will allocate a new policy, set the reference count to 1 and drop the reference count of the old policy. This increases the memory footprint but is not expected to be a major problem as mbind() is unlikely to be used for fine-grained ranges. It is also inefficient because it means we allocate a new policy even in cases where mbind_range() could use the new_policy passed to it. However, it is more straight-forward and the change should be invisible to the user. [mgorman@suse.de: Edited changelog] Reported-by: Dave Jones <davej@redhat.com>, Cc: Christoph Lameter <cl@linux.com>, Reviewed-by: Christoph Lameter <cl@linux.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Josh Boyer <jwboyer@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 23:29:16 +00:00
return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
pgoff_t end, struct sp_node *new)
{
struct sp_node *n;
mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [<ffffffff810aa314>] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [<ffffffff8122f017>] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-21 23:10:25 +00:00
struct sp_node *n_new = NULL;
struct mempolicy *mpol_new = NULL;
int ret = 0;
mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [<ffffffff810aa314>] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [<ffffffff8122f017>] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-21 23:10:25 +00:00
restart:
mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good reducing this consumtion at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer <nzimmer@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Nadia Yvette Chambers <nyc@holomorphy.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Mel Gorman <mgorman@suse.de> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:18:36 +00:00
write_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
struct rb_node *next = rb_next(&n->nd);
if (n->start >= start) {
if (n->end <= end)
sp_delete(sp, n);
else
n->start = end;
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [<ffffffff810aa314>] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [<ffffffff8122f017>] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-21 23:10:25 +00:00
if (!n_new)
goto alloc_new;
*mpol_new = *n->policy;
atomic_set(&mpol_new->refcnt, 1);
sp_node_init(n_new, end, n->end, mpol_new);
n->end = start;
sp_insert(sp, n_new);
mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [<ffffffff810aa314>] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [<ffffffff8122f017>] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-21 23:10:25 +00:00
n_new = NULL;
mpol_new = NULL;
break;
} else
n->end = start;
}
if (!next)
break;
n = rb_entry(next, struct sp_node, nd);
}
if (new)
sp_insert(sp, new);
mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good reducing this consumtion at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer <nzimmer@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Nadia Yvette Chambers <nyc@holomorphy.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Mel Gorman <mgorman@suse.de> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:18:36 +00:00
write_unlock(&sp->lock);
mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [<ffffffff810aa314>] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [<ffffffff8122f017>] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-21 23:10:25 +00:00
ret = 0;
err_out:
if (mpol_new)
mpol_put(mpol_new);
if (n_new)
kmem_cache_free(sn_cache, n_new);
return ret;
mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [<ffffffff810aa314>] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [<ffffffff8122f017>] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-21 23:10:25 +00:00
alloc_new:
mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good reducing this consumtion at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer <nzimmer@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Nadia Yvette Chambers <nyc@holomorphy.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Mel Gorman <mgorman@suse.de> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:18:36 +00:00
write_unlock(&sp->lock);
mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [<ffffffff810aa314>] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [<ffffffff8122f017>] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-21 23:10:25 +00:00
ret = -ENOMEM;
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n_new)
goto err_out;
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
atomic_set(&mpol_new->refcnt, 1);
mm: mempolicy: Convert shared_policy mutex to spinlock Sasha was fuzzing with trinity and reported the following problem: BUG: sleeping function called from invalid context at kernel/mutex.c:269 in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main 2 locks held by trinity-main/6361: #0: (&mm->mmap_sem){++++++}, at: [<ffffffff810aa314>] __do_page_fault+0x1e4/0x4f0 #1: (&(&mm->page_table_lock)->rlock){+.+...}, at: [<ffffffff8122f017>] handle_pte_fault+0x3f7/0x6a0 Pid: 6361, comm: trinity-main Tainted: G W 3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74 Call Trace: __might_sleep+0x1c3/0x1e0 mutex_lock_nested+0x29/0x50 mpol_shared_policy_lookup+0x2e/0x90 shmem_get_policy+0x2e/0x30 get_vma_policy+0x5a/0xa0 mpol_misplaced+0x41/0x1d0 handle_pte_fault+0x465/0x6a0 This was triggered by a different version of automatic NUMA balancing but in theory the current version is vunerable to the same problem. do_numa_page -> numa_migrate_prep -> mpol_misplaced -> get_vma_policy -> shmem_get_policy It's very unlikely this will happen as shared pages are not marked pte_numa -- see the page_mapcount() check in change_pte_range() -- but it is possible. To address this, this patch restores sp->lock as originally implemented by Kosaki Motohiro. In the path where get_vma_policy() is called, it should not be calling sp_alloc() so it is not necessary to treat the PTL specially. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-12-21 23:10:25 +00:00
goto restart;
}
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
/**
* mpol_shared_policy_init - initialize shared policy for inode
* @sp: pointer to inode shared policy
* @mpol: struct mempolicy to install
*
* Install non-NULL @mpol in inode's shared policy rb-tree.
* On entry, the current task has a reference on a non-NULL @mpol.
* This must be released on exit.
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
* This is called at get_inode() calls and we can use GFP_KERNEL.
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
int ret;
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
sp->root = RB_ROOT; /* empty tree == default mempolicy */
mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good reducing this consumtion at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer <nzimmer@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Nadia Yvette Chambers <nyc@holomorphy.com> Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Mel Gorman <mgorman@suse.de> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-01-14 23:18:36 +00:00
rwlock_init(&sp->lock);
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
if (mpol) {
struct sp_node *sn;
struct mempolicy *npol;
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
NODEMASK_SCRATCH(scratch);
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
if (!scratch)
goto put_mpol;
/* contextualize the tmpfs mount point mempolicy to this file */
npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
if (IS_ERR(npol))
goto free_scratch; /* no valid nodemask intersection */
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
task_lock(current);
ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-16 22:31:49 +00:00
task_unlock(current);
if (ret)
goto put_npol;
/* alloc node covering entire file; adds ref to file's npol */
sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
if (sn)
sp_insert(sp, sn);
put_npol:
mpol_put(npol); /* drop initial ref on file's npol */
free_scratch:
mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-08-06 22:07:33 +00:00
NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
mpol_put(mpol); /* drop our incoming ref on sb mpol */
}
}
2023-10-03 09:20:14 +00:00
int mpol_set_shared_policy(struct shared_policy *sp,
struct vm_area_struct *vma, struct mempolicy *pol)
{
int err;
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
2023-10-03 09:20:14 +00:00
if (pol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
if (!new)
return -ENOMEM;
}
2023-10-03 09:20:14 +00:00
err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
if (err && new)
sp_free(new);
return err;
}
/* Free a backing policy store on inode delete. */
2023-10-03 09:20:14 +00:00
void mpol_free_shared_policy(struct shared_policy *sp)
{
struct sp_node *n;
struct rb_node *next;
2023-10-03 09:20:14 +00:00
if (!sp->root.rb_node)
return;
2023-10-03 09:20:14 +00:00
write_lock(&sp->lock);
next = rb_first(&sp->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
2023-10-03 09:20:14 +00:00
sp_delete(sp, n);
}
2023-10-03 09:20:14 +00:00
write_unlock(&sp->lock);
}
#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
bool numabalancing_default = false;
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
if (numabalancing_override)
set_numabalancing_state(numabalancing_override == 1);
if (num_online_nodes() > 1 && !numabalancing_override) {
pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
}
static int __init setup_numabalancing(char *str)
{
int ret = 0;
if (!str)
goto out;
if (!strcmp(str, "enable")) {
numabalancing_override = 1;
ret = 1;
} else if (!strcmp(str, "disable")) {
numabalancing_override = -1;
ret = 1;
}
out:
if (!ret)
pr_warn("Unable to parse numa_balancing=\n");
return ret;
}
__setup("numa_balancing=", setup_numabalancing);
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */
void __init numa_policy_init(void)
{
nodemask_t interleave_nodes;
unsigned long largest = 0;
int nid, prefer = 0;
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
0, SLAB_PANIC, NULL);
sn_cache = kmem_cache_create("shared_policy_node",
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
for_each_node(nid) {
preferred_node_policy[nid] = (struct mempolicy) {
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
.nodes = nodemask_of_node(nid),
};
}
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
* fall back to the largest node if they're all smaller.
*/
nodes_clear(interleave_nodes);
for_each_node_state(nid, N_MEMORY) {
unsigned long total_pages = node_present_pages(nid);
/* Preserve the largest node */
if (largest < total_pages) {
largest = total_pages;
prefer = nid;
}
/* Interleave this node? */
if ((total_pages << PAGE_SHIFT) >= (16 << 20))
node_set(nid, interleave_nodes);
}
/* All too small, use the largest */
if (unlikely(nodes_empty(interleave_nodes)))
node_set(prefer, interleave_nodes);
mempolicy: support optional mode flags With the evolution of mempolicies, it is necessary to support mempolicy mode flags that specify how the policy shall behave in certain circumstances. The most immediate need for mode flag support is to suppress remapping the nodemask of a policy at the time of rebind. Both the mempolicy mode and flags are passed by the user in the 'int policy' formal of either the set_mempolicy() or mbind() syscall. A new constant, MPOL_MODE_FLAGS, represents the union of legal optional flags that may be passed as part of this int. Mempolicies that include illegal flags as part of their policy are rejected as invalid. An additional member to struct mempolicy is added to support the mode flags: struct mempolicy { ... unsigned short policy; unsigned short flags; } The splitting of the 'int' actual passed by the user is done in sys_set_mempolicy() and sys_mbind() for their respective syscalls. This is done by intersecting the actual with MPOL_MODE_FLAGS, rejecting the syscall of there are additional flags, and storing it in the new 'flags' member of struct mempolicy. The intersection of the actual with ~MPOL_MODE_FLAGS is stored in the 'policy' member of the struct and all current users of pol->policy remain unchanged. The union of the policy mode and optional mode flags is passed back to the user in get_mempolicy(). This combination of mode and flags within the same actual does not break userspace code that relies on get_mempolicy(&policy, ...) and either switch (policy) { case MPOL_BIND: ... case MPOL_INTERLEAVE: ... }; statements or if (policy == MPOL_INTERLEAVE) { ... } statements. Such applications would need to use optional mode flags when calling set_mempolicy() or mbind() for these previously implemented statements to stop working. If an application does start using optional mode flags, it will need to mask the optional flags off the policy in switch and conditional statements that only test mode. An additional member is also added to struct shmem_sb_info to store the optional mode flags. [hugh@veritas.com: shmem mpol: fix build warning] Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:25 +00:00
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
pr_err("%s: interleaving failed\n", __func__);
check_numabalancing_enable();
}
/* Reset policy of current process to default */
void numa_default_policy(void)
{
mempolicy: support optional mode flags With the evolution of mempolicies, it is necessary to support mempolicy mode flags that specify how the policy shall behave in certain circumstances. The most immediate need for mode flag support is to suppress remapping the nodemask of a policy at the time of rebind. Both the mempolicy mode and flags are passed by the user in the 'int policy' formal of either the set_mempolicy() or mbind() syscall. A new constant, MPOL_MODE_FLAGS, represents the union of legal optional flags that may be passed as part of this int. Mempolicies that include illegal flags as part of their policy are rejected as invalid. An additional member to struct mempolicy is added to support the mode flags: struct mempolicy { ... unsigned short policy; unsigned short flags; } The splitting of the 'int' actual passed by the user is done in sys_set_mempolicy() and sys_mbind() for their respective syscalls. This is done by intersecting the actual with MPOL_MODE_FLAGS, rejecting the syscall of there are additional flags, and storing it in the new 'flags' member of struct mempolicy. The intersection of the actual with ~MPOL_MODE_FLAGS is stored in the 'policy' member of the struct and all current users of pol->policy remain unchanged. The union of the policy mode and optional mode flags is passed back to the user in get_mempolicy(). This combination of mode and flags within the same actual does not break userspace code that relies on get_mempolicy(&policy, ...) and either switch (policy) { case MPOL_BIND: ... case MPOL_INTERLEAVE: ... }; statements or if (policy == MPOL_INTERLEAVE) { ... } statements. Such applications would need to use optional mode flags when calling set_mempolicy() or mbind() for these previously implemented statements to stop working. If an application does start using optional mode flags, it will need to mask the optional flags off the policy in switch and conditional statements that only test mode. An additional member is also added to struct shmem_sb_info to store the optional mode flags. [hugh@veritas.com: shmem mpol: fix build warning] Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:25 +00:00
do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
[PATCH] cpusets: automatic numa mempolicy rebinding This patch automatically updates a tasks NUMA mempolicy when its cpuset memory placement changes. It does so within the context of the task, without any need to support low level external mempolicy manipulation. If a system is not using cpusets, or if running on a system with just the root (all-encompassing) cpuset, then this remap is a no-op. Only when a task is moved between cpusets, or a cpusets memory placement is changed does the following apply. Otherwise, the main routine below, rebind_policy() is not even called. When mixing cpusets, scheduler affinity, and NUMA mempolicies, the essential role of cpusets is to place jobs (several related tasks) on a set of CPUs and Memory Nodes, the essential role of sched_setaffinity is to manage a jobs processor placement within its allowed cpuset, and the essential role of NUMA mempolicy (mbind, set_mempolicy) is to manage a jobs memory placement within its allowed cpuset. However, CPU affinity and NUMA memory placement are managed within the kernel using absolute system wide numbering, not cpuset relative numbering. This is ok until a job is migrated to a different cpuset, or what's the same, a jobs cpuset is moved to different CPUs and Memory Nodes. Then the CPU affinity and NUMA memory placement of the tasks in the job need to be updated, to preserve their cpuset-relative position. This can be done for CPU affinity using sched_setaffinity() from user code, as one task can modify anothers CPU affinity. This cannot be done from an external task for NUMA memory placement, as that can only be modified in the context of the task using it. However, it easy enough to remap a tasks NUMA mempolicy automatically when a task is migrated, using the existing cpuset mechanism to trigger a refresh of a tasks memory placement after its cpuset has changed. All that is needed is the old and new nodemask, and notice to the task that it needs to rebind its mempolicy. The tasks mems_allowed has the old mask, the tasks cpuset has the new mask, and the existing cpuset_update_current_mems_allowed() mechanism provides the notice. The bitmap/cpumask/nodemask remap operators provide the cpuset relative calculations. This patch leaves open a couple of issues: 1) Updating vma and shmfs/tmpfs/hugetlbfs memory policies: These mempolicies may reference nodes outside of those allowed to the current task by its cpuset. Tasks are migrated as part of jobs, which reside on what might be several cpusets in a subtree. When such a job is migrated, all NUMA memory policy references to nodes within that cpuset subtree should be translated, and references to any nodes outside that subtree should be left untouched. A future patch will provide the cpuset mechanism needed to mark such subtrees. With that patch, we will be able to correctly migrate these other memory policies across a job migration. 2) Updating cpuset, affinity and memory policies in user space: This is harder. Any placement state stored in user space using system-wide numbering will be invalidated across a migration. More work will be required to provide user code with a migration-safe means to manage its cpuset relative placement, while preserving the current API's that pass system wide numbers, not cpuset relative numbers across the kernel-user boundary. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 23:02:36 +00:00
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
/*
* Parse and format mempolicy from/to strings
*/
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
[MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
[MPOL_LOCAL] = "local",
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
[MPOL_PREFERRED_MANY] = "prefer (many)",
};
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
#ifdef CONFIG_TMPFS
/**
tmpfs mempolicy: fix /proc/mounts corrupting memory Recently I suggested using "mount -o remount,mpol=local /tmp" in NUMA mempolicy testing. Very nasty. Reading /proc/mounts, /proc/pid/mounts or /proc/pid/mountinfo may then corrupt one bit of kernel memory, often in a page table (causing "Bad swap" or "Bad page map" warning or "Bad pagetable" oops), sometimes in a vm_area_struct or rbnode or somewhere worse. "mpol=prefer" and "mpol=prefer:Node" are equally toxic. Recent NUMA enhancements are not to blame: this dates back to 2.6.35, when commit e17f74af351c "mempolicy: don't call mpol_set_nodemask() when no_context" skipped mpol_parse_str()'s call to mpol_set_nodemask(), which used to initialize v.preferred_node, or set MPOL_F_LOCAL in flags. With slab poisoning, you can then rely on mpol_to_str() to set the bit for node 0x6b6b, probably in the next page above the caller's stack. mpol_parse_str() is only called from shmem_parse_options(): no_context is always true, so call it unused for now, and remove !no_context code. Set v.nodes or v.preferred_node or MPOL_F_LOCAL as mpol_to_str() might expect. Then mpol_to_str() can ignore its no_context argument also, the mpol being appropriately initialized whether contextualized or not. Rename its no_context unused too, and let subsequent patch remove them (that's not needed for stable backporting, which would involve rejects). I don't understand why MPOL_LOCAL is described as a pseudo-policy: it's a reasonable policy which suffers from a confusing implementation in terms of MPOL_PREFERRED with MPOL_F_LOCAL. I believe this would be much more robust if MPOL_LOCAL were recognized in switch statements throughout, MPOL_F_LOCAL deleted, and MPOL_PREFERRED use the (possibly empty) nodes mask like everyone else, instead of its preferred_node variant (I presume an optimization from the days before MPOL_LOCAL). But that would take me too long to get right and fully tested. Signed-off-by: Hugh Dickins <hughd@google.com> Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-01-02 10:01:33 +00:00
* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
* @str: string containing mempolicy to parse
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
* @mpol: pointer to struct mempolicy pointer, returned on success.
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
*
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
* Return: %0 on success, else %1
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
*/
int mpol_parse_str(char *str, struct mempolicy **mpol)
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
{
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
struct mempolicy *new = NULL;
tmpfs mempolicy: fix /proc/mounts corrupting memory Recently I suggested using "mount -o remount,mpol=local /tmp" in NUMA mempolicy testing. Very nasty. Reading /proc/mounts, /proc/pid/mounts or /proc/pid/mountinfo may then corrupt one bit of kernel memory, often in a page table (causing "Bad swap" or "Bad page map" warning or "Bad pagetable" oops), sometimes in a vm_area_struct or rbnode or somewhere worse. "mpol=prefer" and "mpol=prefer:Node" are equally toxic. Recent NUMA enhancements are not to blame: this dates back to 2.6.35, when commit e17f74af351c "mempolicy: don't call mpol_set_nodemask() when no_context" skipped mpol_parse_str()'s call to mpol_set_nodemask(), which used to initialize v.preferred_node, or set MPOL_F_LOCAL in flags. With slab poisoning, you can then rely on mpol_to_str() to set the bit for node 0x6b6b, probably in the next page above the caller's stack. mpol_parse_str() is only called from shmem_parse_options(): no_context is always true, so call it unused for now, and remove !no_context code. Set v.nodes or v.preferred_node or MPOL_F_LOCAL as mpol_to_str() might expect. Then mpol_to_str() can ignore its no_context argument also, the mpol being appropriately initialized whether contextualized or not. Rename its no_context unused too, and let subsequent patch remove them (that's not needed for stable backporting, which would involve rejects). I don't understand why MPOL_LOCAL is described as a pseudo-policy: it's a reasonable policy which suffers from a confusing implementation in terms of MPOL_PREFERRED with MPOL_F_LOCAL. I believe this would be much more robust if MPOL_LOCAL were recognized in switch statements throughout, MPOL_F_LOCAL deleted, and MPOL_PREFERRED use the (possibly empty) nodes mask like everyone else, instead of its preferred_node variant (I presume an optimization from the days before MPOL_LOCAL). But that would take me too long to get right and fully tested. Signed-off-by: Hugh Dickins <hughd@google.com> Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-01-02 10:01:33 +00:00
unsigned short mode_flags;
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
nodemask_t nodes;
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
int err = 1, mode;
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
if (flags)
*flags++ = '\0'; /* terminate mode string */
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
if (nodelist_parse(nodelist, nodes))
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
goto out;
if (!nodes_subset(nodes, node_states[N_MEMORY]))
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
goto out;
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
} else
nodes_clear(nodes);
mode = match_string(policy_modes, MPOL_MAX, str);
if (mode < 0)
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
goto out;
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
switch (mode) {
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
case MPOL_PREFERRED:
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
/*
* Insist on a nodelist of one node only, although later
* we use first_node(nodes) to grab a single node, so here
* nodelist (or nodes) cannot be empty.
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
*/
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
if (nodelist) {
char *rest = nodelist;
while (isdigit(*rest))
rest++;
if (*rest)
goto out;
if (nodes_empty(nodes))
goto out;
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
}
break;
case MPOL_INTERLEAVE:
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
case MPOL_WEIGHTED_INTERLEAVE:
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
/*
* Default to online nodes with memory if no nodelist
*/
if (!nodelist)
nodes = node_states[N_MEMORY];
break;
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
case MPOL_LOCAL:
/*
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
* Don't allow a nodelist; mpol_new() checks flags
*/
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
if (nodelist)
goto out;
break;
case MPOL_DEFAULT:
/*
* Insist on a empty nodelist
*/
if (!nodelist)
err = 0;
goto out;
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
/*
* Insist on a nodelist
*/
if (!nodelist)
goto out;
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
}
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
mode_flags = 0;
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
if (flags) {
/*
* Currently, we only support two mutually exclusive
* mode flags.
*/
if (!strcmp(flags, "static"))
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
mode_flags |= MPOL_F_STATIC_NODES;
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
else if (!strcmp(flags, "relative"))
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
mode_flags |= MPOL_F_RELATIVE_NODES;
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
else
goto out;
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
}
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
new = mpol_new(mode, mode_flags, &nodes);
if (IS_ERR(new))
goto out;
tmpfs mempolicy: fix /proc/mounts corrupting memory Recently I suggested using "mount -o remount,mpol=local /tmp" in NUMA mempolicy testing. Very nasty. Reading /proc/mounts, /proc/pid/mounts or /proc/pid/mountinfo may then corrupt one bit of kernel memory, often in a page table (causing "Bad swap" or "Bad page map" warning or "Bad pagetable" oops), sometimes in a vm_area_struct or rbnode or somewhere worse. "mpol=prefer" and "mpol=prefer:Node" are equally toxic. Recent NUMA enhancements are not to blame: this dates back to 2.6.35, when commit e17f74af351c "mempolicy: don't call mpol_set_nodemask() when no_context" skipped mpol_parse_str()'s call to mpol_set_nodemask(), which used to initialize v.preferred_node, or set MPOL_F_LOCAL in flags. With slab poisoning, you can then rely on mpol_to_str() to set the bit for node 0x6b6b, probably in the next page above the caller's stack. mpol_parse_str() is only called from shmem_parse_options(): no_context is always true, so call it unused for now, and remove !no_context code. Set v.nodes or v.preferred_node or MPOL_F_LOCAL as mpol_to_str() might expect. Then mpol_to_str() can ignore its no_context argument also, the mpol being appropriately initialized whether contextualized or not. Rename its no_context unused too, and let subsequent patch remove them (that's not needed for stable backporting, which would involve rejects). I don't understand why MPOL_LOCAL is described as a pseudo-policy: it's a reasonable policy which suffers from a confusing implementation in terms of MPOL_PREFERRED with MPOL_F_LOCAL. I believe this would be much more robust if MPOL_LOCAL were recognized in switch statements throughout, MPOL_F_LOCAL deleted, and MPOL_PREFERRED use the (possibly empty) nodes mask like everyone else, instead of its preferred_node variant (I presume an optimization from the days before MPOL_LOCAL). But that would take me too long to get right and fully tested. Signed-off-by: Hugh Dickins <hughd@google.com> Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-01-02 10:01:33 +00:00
/*
* Save nodes for mpol_to_str() to show the tmpfs mount options
* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
*/
if (mode != MPOL_PREFERRED) {
new->nodes = nodes;
} else if (nodelist) {
nodes_clear(new->nodes);
node_set(first_node(nodes), new->nodes);
} else {
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
new->mode = MPOL_LOCAL;
}
tmpfs mempolicy: fix /proc/mounts corrupting memory Recently I suggested using "mount -o remount,mpol=local /tmp" in NUMA mempolicy testing. Very nasty. Reading /proc/mounts, /proc/pid/mounts or /proc/pid/mountinfo may then corrupt one bit of kernel memory, often in a page table (causing "Bad swap" or "Bad page map" warning or "Bad pagetable" oops), sometimes in a vm_area_struct or rbnode or somewhere worse. "mpol=prefer" and "mpol=prefer:Node" are equally toxic. Recent NUMA enhancements are not to blame: this dates back to 2.6.35, when commit e17f74af351c "mempolicy: don't call mpol_set_nodemask() when no_context" skipped mpol_parse_str()'s call to mpol_set_nodemask(), which used to initialize v.preferred_node, or set MPOL_F_LOCAL in flags. With slab poisoning, you can then rely on mpol_to_str() to set the bit for node 0x6b6b, probably in the next page above the caller's stack. mpol_parse_str() is only called from shmem_parse_options(): no_context is always true, so call it unused for now, and remove !no_context code. Set v.nodes or v.preferred_node or MPOL_F_LOCAL as mpol_to_str() might expect. Then mpol_to_str() can ignore its no_context argument also, the mpol being appropriately initialized whether contextualized or not. Rename its no_context unused too, and let subsequent patch remove them (that's not needed for stable backporting, which would involve rejects). I don't understand why MPOL_LOCAL is described as a pseudo-policy: it's a reasonable policy which suffers from a confusing implementation in terms of MPOL_PREFERRED with MPOL_F_LOCAL. I believe this would be much more robust if MPOL_LOCAL were recognized in switch statements throughout, MPOL_F_LOCAL deleted, and MPOL_PREFERRED use the (possibly empty) nodes mask like everyone else, instead of its preferred_node variant (I presume an optimization from the days before MPOL_LOCAL). But that would take me too long to get right and fully tested. Signed-off-by: Hugh Dickins <hughd@google.com> Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-01-02 10:01:33 +00:00
/*
* Save nodes for contextualization: this will be used to "clone"
* the mempolicy in a specific context [cpuset] at a later time.
*/
new->w.user_nodemask = nodes;
err = 0;
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
out:
/* Restore string for error message */
if (nodelist)
*--nodelist = ':';
if (flags)
*--flags = '=';
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
if (!err)
*mpol = new;
mempolicy: rework shmem mpol parsing and display mm/shmem.c currently contains functions to parse and display memory policy strings for the tmpfs 'mpol' mount option. Move this to mm/mempolicy.c with the rest of the mempolicy support. With subsequent patches, we'll be able to remove knowledge of the details [mode, flags, policy, ...] completely from shmem.c 1) replace shmem_parse_mpol() in mm/shmem.c with mpol_parse_str() in mm/mempolicy.c. Rework to use the policy_types[] array [used by mpol_to_str()] to look up mode by name. 2) use mpol_to_str() to format policy for shmem_show_mpol(). mpol_to_str() expects a pointer to a struct mempolicy, so temporarily construct one. This will be replaced with a reference to a struct mempolicy in the tmpfs superblock in a subsequent patch. NOTE 1: I changed mpol_to_str() to use a colon ':' rather than an equal sign '=' as the nodemask delimiter to match mpol_parse_str() and the tmpfs/shmem mpol mount option formatting that now uses mpol_to_str(). This is a user visible change to numa_maps, but then the addition of the mode flags already changed the display. It makes sense to me to have the mounts and numa_maps display the policy in the same format. However, if anyone objects strongly, I can pass the desired nodemask delimeter as an arg to mpol_to_str(). Note 2: Like show_numa_map(), I don't check the return code from mpol_to_str(). I do use a longer buffer than the one provided by show_numa_map(), which seems to have sufficed so far. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:23 +00:00
return err;
}
#endif /* CONFIG_TMPFS */
mempolicy: use struct mempolicy pointer in shmem_sb_info This patch replaces the mempolicy mode, mode_flags, and nodemask in the shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL. This removes dependency on the details of mempolicy from shmem.c and hugetlbfs inode.c and simplifies the interfaces. mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a pointer arg, a struct mempolicy pointer on success. For MPOL_DEFAULT, the returned pointer is NULL. Further, mpol_parse_str() now takes a 'no_context' argument that causes the input nodemask to be stored in the w.user_nodemask of the created mempolicy for use when the mempolicy is installed in a tmpfs inode shared policy tree. At that time, any cpuset contextualization is applied to the original input nodemask. This preserves the previous behavior where the input nodemask was stored in the superblock. We can think of the returned mempolicy as "context free". Because mpol_parse_str() is now calling mpol_new(), we can remove from mpol_to_str() the semantic checks that mpol_new() already performs. Add 'no_context' parameter to mpol_to_str() to specify that it should format the nodemask in w.user_nodemask for 'bind' and 'interleave' policies. Change mpol_shared_policy_init() to take a pointer to a "context free" struct mempolicy and to create a new, "contextualized" mempolicy using the mode, mode_flags and user_nodemask from the input mempolicy. Note: we know that the mempolicy passed to mpol_to_str() or mpol_shared_policy_init() from a tmpfs superblock is "context free". This is currently the only instance thereof. However, if we found more uses for this concept, and introduced any ambiguity as to whether a mempolicy was context free or not, we could add another internal mode flag to identify context free mempolicies. Then, we could remove the 'no_context' argument from mpol_to_str(). Added shmem_get_sbmpol() to return a reference counted superblock mempolicy, if one exists, to pass to mpol_shared_policy_init(). We must add the reference under the sb stat_lock to prevent races with replacement of the mpol by remount. This reference is removed in mpol_shared_policy_init(). [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: another build fix] [akpm@linux-foundation.org: yet another build fix] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:26 +00:00
/**
* mpol_to_str - format a mempolicy structure for printing
* @buffer: to contain formatted mempolicy string
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
*
* Convert @pol into a string. If @buffer is too short, truncate the string.
* Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
* longest flag, "relative", and to display at least a few node ids.
*/
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
nodemask_t nodes = NODE_MASK_NONE;
unsigned short mode = MPOL_DEFAULT;
unsigned short flags = 0;
mempolicy: clean-up mpol-to-str() mempolicy formatting mpol-to-str() formats memory policies into printable strings. Currently this is only used to display "numa_maps". A subsequent patch will use mpol_to_str() for formatting tmpfs [shmem] mpol mount options, allowing us to remove essentially duplicate code in mm/shmem.c. This patch cleans up mpol_to_str() generally and in preparation for that patch. 1) show_numa_maps() is not checking the return code from mpol_to_str(). There's not a lot we can do in this context if mpol_to_str() did return the error [insufficient space in buffer]. Proposed "solution": just check, under DEBUG_VM, that callers are providing sufficient buffer space for the policy, flags, and a few nodes. This way, we'll get some display. show_numa_maps() is providing a 50-byte buffer, so it won't trip this check. 50-bytes should be sufficient unless one has a large number of nodes in a very sparse nodemask. 2) The display of the new mode flags ["static" & "relative"] was set up to display multiple flags, separated by a "bar" '|'. However, this support is incomplete--e.g., need_bar was never incremented; and currently, these two flags are mutually exclusive. So remove the "bar" support, for now, and only display one flag. 3) Use snprint() to format flags, so as not to overflow the buffer. Not that it's ever happed, AFAIK. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:22 +00:00
if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
mode = pol->mode;
flags = pol->flags;
}
mempolicy: use MPOL_PREFERRED for system-wide default policy Currently, when one specifies MPOL_DEFAULT via a NUMA memory policy API [set_mempolicy(), mbind() and internal versions], the kernel simply installs a NULL struct mempolicy pointer in the appropriate context: task policy, vma policy, or shared policy. This causes any use of that policy to "fall back" to the next most specific policy scope. The only use of MPOL_DEFAULT to mean "local allocation" is in the system default policy. This requires extra checks/cases for MPOL_DEFAULT in many mempolicy.c functions. There is another, "preferred" way to specify local allocation via the APIs. That is using the MPOL_PREFERRED policy mode with an empty nodemask. Internally, the empty nodemask gets converted to a preferred_node id of '-1'. All internal usage of MPOL_PREFERRED will convert the '-1' to the id of the node local to the cpu where the allocation occurs. System default policy, except during boot, is hard-coded to "local allocation". By using the MPOL_PREFERRED mode with a negative value of preferred node for system default policy, MPOL_DEFAULT will never occur in the 'policy' member of a struct mempolicy. Thus, we can remove all checks for MPOL_DEFAULT when converting policy to a node id/zonelist in the allocation paths. In slab_node() return local node id when policy pointer is NULL. No need to set a pol value to take the switch default. Replace switch default with BUG()--i.e., shouldn't happen. With this patch MPOL_DEFAULT is only used in the APIs, including internal calls to do_set_mempolicy() and in the display of policy in /proc/<pid>/numa_maps. It always means "fall back" to the the next most specific policy scope. This simplifies the description of memory policies quite a bit, with no visible change in behavior. get_mempolicy() continues to return MPOL_DEFAULT and an empty nodemask when the requested policy [task or vma/shared] is NULL. These are the values one would supply via set_mempolicy() or mbind() to achieve that condition--default behavior. This patch updates Documentation to reflect this change. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:18 +00:00
switch (mode) {
case MPOL_DEFAULT:
mm/mempolicy: don't handle MPOL_LOCAL like a fake MPOL_PREFERRED policy MPOL_LOCAL policy has been setup as a real policy, but it is still handled like a faked POL_PREFERRED policy with one internal MPOL_F_LOCAL flag bit set, and there are many places having to judge the real 'prefer' or the 'local' policy, which are quite confusing. In current code, there are 4 cases that MPOL_LOCAL are used: 1. user specifies 'local' policy 2. user specifies 'prefer' policy, but with empty nodemask 3. system 'default' policy is used 4. 'prefer' policy + valid 'preferred' node with MPOL_F_STATIC_NODES flag set, and when it is 'rebind' to a nodemask which doesn't contains the 'preferred' node, it will perform as 'local' policy So make 'local' a real policy instead of a fake 'prefer' one, and kill MPOL_F_LOCAL bit, which can greatly reduce the confusion for code reading. For case 4, the logic of mpol_rebind_preferred() is confusing, as Michal Hocko pointed out: : I do believe that rebinding preferred policy is just bogus and it should : be dropped altogether on the ground that a preference is a mere hint from : userspace where to start the allocation. Unless I am missing something : cpusets will be always authoritative for the final placement. The : preferred node just acts as a starting point and it should be really : preserved when cpusets changes. Otherwise we have a very subtle behavior : corner cases. So dump all the tricky transformation between 'prefer' and 'local', and just record the new nodemask of rebinding. [feng.tang@intel.com: fix a problem in mpol_set_nodemask(), per Michal Hocko] Link: https://lkml.kernel.org/r/1622560492-1294-3-git-send-email-feng.tang@intel.com [feng.tang@intel.com: refine code and comments of mpol_set_nodemask(), per Michal] Link: https://lkml.kernel.org/r/20210603081807.GE56979@shbuild999.sh.intel.com Link: https://lkml.kernel.org/r/1622469956-82897-3-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang <feng.tang@intel.com> Suggested-by: Michal Hocko <mhocko@suse.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Ben Widawsky <ben.widawsky@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-07-01 01:51:00 +00:00
case MPOL_LOCAL:
break;
case MPOL_PREFERRED:
mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Ben Widawsky <ben.widawsky@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Feng Tang <feng.tang@intel.com> Cc: Michal Hocko <mhocko@kernel.org> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Andi Kleen <ak@linux.intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Huang Ying <ying.huang@intel.com>b Cc: Michal Hocko <mhocko@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-09-02 22:00:06 +00:00
case MPOL_PREFERRED_MANY:
case MPOL_BIND:
case MPOL_INTERLEAVE:
mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for weighted interleaving When a system has multiple NUMA nodes and it becomes bandwidth hungry, using the current MPOL_INTERLEAVE could be an wise option. However, if those NUMA nodes consist of different types of memory such as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin based interleave policy does not optimally distribute data to make use of their different bandwidth characteristics. Instead, interleave is more effective when the allocation policy follows each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution. This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE, enabling weighted interleave between NUMA nodes. Weighted interleave allows for proportional distribution of memory across multiple numa nodes, preferably apportioned to match the bandwidth of each node. For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1), with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate weight distribution is (2:1). Weights for each node can be assigned via the new sysfs extension: /sys/kernel/mm/mempolicy/weighted_interleave/ For now, the default value of all nodes will be `1`, which matches the behavior of standard 1:1 round-robin interleave. An extension will be added in the future to allow default values to be registered at kernel and device bringup time. The policy allocates a number of pages equal to the set weights. For example, if the weights are (2,1), then 2 pages will be allocated on node0 for every 1 page allocated on node1. The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2) and mbind(2). Some high level notes about the pieces of weighted interleave: current->il_prev: Tracks the node previously allocated from. current->il_weight: The active weight of the current node (current->il_prev) When this reaches 0, current->il_prev is set to the next node and current->il_weight is set to the next weight. weighted_interleave_nodes: Counts the number of allocations as they occur, and applies the weight for the current node. When the weight reaches 0, switch to the next node. Operates only on task->mempolicy. weighted_interleave_nid: Gets the total weight of the nodemask as well as each individual node weight, then calculates the node based on the given index. Operates on VMA policies. bulk_array_weighted_interleave: Gets the total weight of the nodemask as well as each individual node weight, then calculates the number of "interleave rounds" as well as any delta ("partial round"). Calculates the number of pages for each node and allocates them. If a node was scheduled for interleave via interleave_nodes, the current weight will be allocated first. Operates only on the task->mempolicy. One piece of complexity is the interaction between a recent refactor which split the logic to acquire the "ilx" (interleave index) of an allocation and the actually application of the interleave. If a call to alloc_pages_mpol() were made with a weighted-interleave policy and ilx set to NO_INTERLEAVE_INDEX, weighted_interleave_nodes() would operate on a VMA policy - violating the description above. An inspection of all callers of alloc_pages_mpol() shows that all external callers set ilx to `0`, an index value, or will call get_vma_policy() to acquire the ilx. For example, mm/shmem.c may call into alloc_pages_mpol. The call stacks all set (pgoff_t ilx) or end up in `get_vma_policy()`. This enforces the `weighted_interleave_nodes()` and `weighted_interleave_nid()` policy requirements (task/vma respectively). Link: https://lkml.kernel.org/r/20240202170238.90004-4-gregory.price@memverge.com Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Co-developed-by: Honggyu Kim <honggyu.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:37 +00:00
case MPOL_WEIGHTED_INTERLEAVE:
nodes = pol->nodes;
break;
default:
WARN_ON_ONCE(1);
snprintf(p, maxlen, "unknown");
return;
}
p += snprintf(p, maxlen, "%s", policy_modes[mode]);
if (flags & MPOL_MODE_FLAGS) {
p += snprintf(p, buffer + maxlen - p, "=");
mempolicy: add MPOL_F_STATIC_NODES flag Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses the node remap when the policy is rebound. Adds another member to struct mempolicy, nodemask_t user_nodemask, as part of a union with cpuset_mems_allowed: struct mempolicy { ... union { nodemask_t cpuset_mems_allowed; nodemask_t user_nodemask; } w; } that stores the the nodemask that the user passed when he or she created the mempolicy via set_mempolicy() or mbind(). When using MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's passed nodemask intersected with the VMA or task's allowed nodes is always used when determining the preferred node, setting the MPOL_BIND zonelist, or creating the interleave nodemask. This happens whenever the policy is rebound, including when a task's cpuset assignment changes or the cpuset's mems are changed. This creates an interesting side-effect in that it allows the mempolicy "intent" to lie dormant and uneffected until it has access to the node(s) that it desires. For example, if you currently ask for an interleaved policy over a set of nodes that you do not have access to, the mempolicy is not created and the task continues to use the previous policy. With this change, however, it is possible to create the same mempolicy; it is only effected when access to nodes in the nodemask is acquired. It is also possible to mount tmpfs with the static nodemask behavior when specifying a node or nodemask. To do this, simply add "=static" immediately following the mempolicy mode at mount time: mount -o remount mpol=interleave=static:1-3 Also removes mpol_check_policy() and folds its logic into mpol_new() since it is now obsoleted. The unused vma_mpol_equal() is also removed. Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:27 +00:00
mempolicy: clean-up mpol-to-str() mempolicy formatting mpol-to-str() formats memory policies into printable strings. Currently this is only used to display "numa_maps". A subsequent patch will use mpol_to_str() for formatting tmpfs [shmem] mpol mount options, allowing us to remove essentially duplicate code in mm/shmem.c. This patch cleans up mpol_to_str() generally and in preparation for that patch. 1) show_numa_maps() is not checking the return code from mpol_to_str(). There's not a lot we can do in this context if mpol_to_str() did return the error [insufficient space in buffer]. Proposed "solution": just check, under DEBUG_VM, that callers are providing sufficient buffer space for the policy, flags, and a few nodes. This way, we'll get some display. show_numa_maps() is providing a 50-byte buffer, so it won't trip this check. 50-bytes should be sufficient unless one has a large number of nodes in a very sparse nodemask. 2) The display of the new mode flags ["static" & "relative"] was set up to display multiple flags, separated by a "bar" '|'. However, this support is incomplete--e.g., need_bar was never incremented; and currently, these two flags are mutually exclusive. So remove the "bar" support, for now, and only display one flag. 3) Use snprint() to format flags, so as not to overflow the buffer. Not that it's ever happed, AFAIK. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:22 +00:00
/*
* Currently, the only defined flags are mutually exclusive
*/
mempolicy: add MPOL_F_STATIC_NODES flag Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses the node remap when the policy is rebound. Adds another member to struct mempolicy, nodemask_t user_nodemask, as part of a union with cpuset_mems_allowed: struct mempolicy { ... union { nodemask_t cpuset_mems_allowed; nodemask_t user_nodemask; } w; } that stores the the nodemask that the user passed when he or she created the mempolicy via set_mempolicy() or mbind(). When using MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's passed nodemask intersected with the VMA or task's allowed nodes is always used when determining the preferred node, setting the MPOL_BIND zonelist, or creating the interleave nodemask. This happens whenever the policy is rebound, including when a task's cpuset assignment changes or the cpuset's mems are changed. This creates an interesting side-effect in that it allows the mempolicy "intent" to lie dormant and uneffected until it has access to the node(s) that it desires. For example, if you currently ask for an interleaved policy over a set of nodes that you do not have access to, the mempolicy is not created and the task continues to use the previous policy. With this change, however, it is possible to create the same mempolicy; it is only effected when access to nodes in the nodemask is acquired. It is also possible to mount tmpfs with the static nodemask behavior when specifying a node or nodemask. To do this, simply add "=static" immediately following the mempolicy mode at mount time: mount -o remount mpol=interleave=static:1-3 Also removes mpol_check_policy() and folds its logic into mpol_new() since it is now obsoleted. The unused vma_mpol_equal() is also removed. Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:27 +00:00
if (flags & MPOL_F_STATIC_NODES)
mempolicy: clean-up mpol-to-str() mempolicy formatting mpol-to-str() formats memory policies into printable strings. Currently this is only used to display "numa_maps". A subsequent patch will use mpol_to_str() for formatting tmpfs [shmem] mpol mount options, allowing us to remove essentially duplicate code in mm/shmem.c. This patch cleans up mpol_to_str() generally and in preparation for that patch. 1) show_numa_maps() is not checking the return code from mpol_to_str(). There's not a lot we can do in this context if mpol_to_str() did return the error [insufficient space in buffer]. Proposed "solution": just check, under DEBUG_VM, that callers are providing sufficient buffer space for the policy, flags, and a few nodes. This way, we'll get some display. show_numa_maps() is providing a 50-byte buffer, so it won't trip this check. 50-bytes should be sufficient unless one has a large number of nodes in a very sparse nodemask. 2) The display of the new mode flags ["static" & "relative"] was set up to display multiple flags, separated by a "bar" '|'. However, this support is incomplete--e.g., need_bar was never incremented; and currently, these two flags are mutually exclusive. So remove the "bar" support, for now, and only display one flag. 3) Use snprint() to format flags, so as not to overflow the buffer. Not that it's ever happed, AFAIK. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:13:22 +00:00
p += snprintf(p, buffer + maxlen - p, "static");
else if (flags & MPOL_F_RELATIVE_NODES)
p += snprintf(p, buffer + maxlen - p, "relative");
mempolicy: add MPOL_F_STATIC_NODES flag Add an optional mempolicy mode flag, MPOL_F_STATIC_NODES, that suppresses the node remap when the policy is rebound. Adds another member to struct mempolicy, nodemask_t user_nodemask, as part of a union with cpuset_mems_allowed: struct mempolicy { ... union { nodemask_t cpuset_mems_allowed; nodemask_t user_nodemask; } w; } that stores the the nodemask that the user passed when he or she created the mempolicy via set_mempolicy() or mbind(). When using MPOL_F_STATIC_NODES, which is passed with any mempolicy mode, the user's passed nodemask intersected with the VMA or task's allowed nodes is always used when determining the preferred node, setting the MPOL_BIND zonelist, or creating the interleave nodemask. This happens whenever the policy is rebound, including when a task's cpuset assignment changes or the cpuset's mems are changed. This creates an interesting side-effect in that it allows the mempolicy "intent" to lie dormant and uneffected until it has access to the node(s) that it desires. For example, if you currently ask for an interleaved policy over a set of nodes that you do not have access to, the mempolicy is not created and the task continues to use the previous policy. With this change, however, it is possible to create the same mempolicy; it is only effected when access to nodes in the nodemask is acquired. It is also possible to mount tmpfs with the static nodemask behavior when specifying a node or nodemask. To do this, simply add "=static" immediately following the mempolicy mode at mount time: mount -o remount mpol=interleave=static:1-3 Also removes mpol_check_policy() and folds its logic into mpol_new() since it is now obsoleted. The unused vma_mpol_equal() is also removed. Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 09:12:27 +00:00
}
if (!nodes_empty(nodes))
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
mm/mempolicy: implement the sysfs-based weighted_interleave interface Patch series "mm/mempolicy: weighted interleave mempolicy and sysfs extension", v5. Weighted interleave is a new interleave policy intended to make use of heterogeneous memory environments appearing with CXL. The existing interleave mechanism does an even round-robin distribution of memory across all nodes in a nodemask, while weighted interleave distributes memory across nodes according to a provided weight. (Weight = # of page allocations per round) Weighted interleave is intended to reduce average latency when bandwidth is pressured - therefore increasing total throughput. In other words: It allows greater use of the total available bandwidth in a heterogeneous hardware environment (different hardware provides different bandwidth capacity). As bandwidth is pressured, latency increases - first linearly and then exponentially. By keeping bandwidth usage distributed according to available bandwidth, we therefore can reduce the average latency of a cacheline fetch. A good explanation of the bandwidth vs latency response curve: https://mahmoudhatem.wordpress.com/2017/11/07/memory-bandwidth-vs-latency-response-curve/ From the article: ``` Constant region: The latency response is fairly constant for the first 40% of the sustained bandwidth. Linear region: In between 40% to 80% of the sustained bandwidth, the latency response increases almost linearly with the bandwidth demand of the system due to contention overhead by numerous memory requests. Exponential region: Between 80% to 100% of the sustained bandwidth, the memory latency is dominated by the contention latency which can be as much as twice the idle latency or more. Maximum sustained bandwidth : Is 65% to 75% of the theoretical maximum bandwidth. ``` As a general rule of thumb: * If bandwidth usage is low, latency does not increase. It is optimal to place data in the nearest (lowest latency) device. * If bandwidth usage is high, latency increases. It is optimal to place data such that bandwidth use is optimized per-device. This is the top line goal: Provide a user a mechanism to target using the "maximum sustained bandwidth" of each hardware component in a heterogenous memory system. For example, the stream benchmark demonstrates that 1:1 (default) interleave is actively harmful, while weighted interleave can be beneficial. Default interleave distributes data such that too much pressure is placed on devices with lower available bandwidth. Stream Benchmark (vs DRAM, 1 Socket + 1 CXL Device) Default interleave : -78% (slower than DRAM) Global weighting : -6% to +4% (workload dependant) Targeted weights : +2.5% to +4% (consistently better than DRAM) Global means the task-policy was set (set_mempolicy), while targeted means VMA policies were set (mbind2). We see weighted interleave is not always beneficial when applied globally, but is always beneficial when applied to bandwidth-driving memory regions. There are 4 patches in this set: 1) Implement system-global interleave weights as sysfs extension in mm/mempolicy.c. These weights are RCU protected, and a default weight set is provided (all weights are 1 by default). In future work, we intend to expose an interface for HMAT/CDAT code to set reasonable default values based on the memory configuration of the system discovered at boot/hotplug. 2) A mild refactor of some interleave-logic for re-use in the new weighted interleave logic. 3) MPOL_WEIGHTED_INTERLEAVE extension for set_mempolicy/mbind 4) Protect interleave logic (weighted and normal) with the mems_allowed seq cookie. If the nodemask changes while accessing it during a rebind, just retry the access. Included below are some performance and LTP test information, and a sample numactl branch which can be used for testing. = Performance summary = (tests may have different configurations, see extended info below) 1) MLC (W2) : +38% over DRAM. +264% over default interleave. MLC (W5) : +40% over DRAM. +226% over default interleave. 2) Stream : -6% to +4% over DRAM, +430% over default interleave. 3) XSBench : +19% over DRAM. +47% over default interleave. = LTP Testing Summary = existing mempolicy & mbind tests: pass mempolicy & mbind + weighted interleave (global weights): pass = version history v5: - style fixes - mems_allowed cookie protection to detect rebind issues, prevents spurious allocation failures and/or mis-allocations - sparse warning fixes related to __rcu on local variables ===================================================================== Performance tests - MLC From - Ravi Jonnalagadda <ravis.opensrc@micron.com> Hardware: Single-socket, multiple CXL memory expanders. Workload: W2 Data Signature: 2:1 read:write DRAM only bandwidth (GBps): 298.8 DRAM + CXL (default interleave) (GBps): 113.04 DRAM + CXL (weighted interleave)(GBps): 412.5 Gain over DRAM only: 1.38x Gain over default interleave: 2.64x Workload: W5 Data Signature: 1:1 read:write DRAM only bandwidth (GBps): 273.2 DRAM + CXL (default interleave) (GBps): 117.23 DRAM + CXL (weighted interleave)(GBps): 382.7 Gain over DRAM only: 1.4x Gain over default interleave: 2.26x ===================================================================== Performance test - Stream From - Gregory Price <gregory.price@memverge.com> Hardware: Single socket, single CXL expander numactl extension: https://github.com/gmprice/numactl/tree/weighted_interleave_master Summary: 64 threads, ~18GB workload, 3GB per array, executed 100 times Default interleave : -78% (slower than DRAM) Global weighting : -6% to +4% (workload dependant) mbind2 weights : +2.5% to +4% (consistently better than DRAM) dram only: numactl --cpunodebind=1 --membind=1 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Function Direction BestRateMBs AvgTime MinTime MaxTime Copy: 0->0 200923.2 0.032662 0.031853 0.033301 Scale: 0->0 202123.0 0.032526 0.031664 0.032970 Add: 0->0 208873.2 0.047322 0.045961 0.047884 Triad: 0->0 208523.8 0.047262 0.046038 0.048414 CXL-only: numactl --cpunodebind=1 -w --membind=2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 22209.7 0.288661 0.288162 0.289342 Scale: 0->0 22288.2 0.287549 0.287147 0.288291 Add: 0->0 24419.1 0.393372 0.393135 0.393735 Triad: 0->0 24484.6 0.392337 0.392083 0.394331 Based on the above, the optimal weights are ~9:1 echo 9 > /sys/kernel/mm/mempolicy/weighted_interleave/node1 echo 1 > /sys/kernel/mm/mempolicy/weighted_interleave/node2 default interleave: numactl --cpunodebind=1 --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 44666.2 0.143671 0.143285 0.144174 Scale: 0->0 44781.6 0.143256 0.142916 0.143713 Add: 0->0 48600.7 0.197719 0.197528 0.197858 Triad: 0->0 48727.5 0.197204 0.197014 0.197439 global weighted interleave: numactl --cpunodebind=1 -w --interleave=1,2 ./stream_c.exe --ntimes 100 --array-size 400M --malloc Copy: 0->0 190085.9 0.034289 0.033669 0.034645 Scale: 0->0 207677.4 0.031909 0.030817 0.033061 Add: 0->0 202036.8 0.048737 0.047516 0.053409 Triad: 0->0 217671.5 0.045819 0.044103 0.046755 targted regions w/ global weights (modified stream to mbind2 malloc'd regions)) numactl --cpunodebind=1 --membind=1 ./stream_c.exe -b --ntimes 100 --array-size 400M --malloc Copy: 0->0 205827.0 0.031445 0.031094 0.031984 Scale: 0->0 208171.8 0.031320 0.030744 0.032505 Add: 0->0 217352.0 0.045087 0.044168 0.046515 Triad: 0->0 216884.8 0.045062 0.044263 0.046982 ===================================================================== Performance tests - XSBench From - Hyeongtak Ji <hyeongtak.ji@sk.com> Hardware: Single socket, Single CXL memory Expander NUMA node 0: 56 logical cores, 128 GB memory NUMA node 2: 96 GB CXL memory Threads: 56 Lookups: 170,000,000 Summary: +19% over DRAM. +47% over default interleave. Performance tests - XSBench 1. dram only $ numactl -m 0 ./XSBench -s XL –p 5000000 Runtime: 36.235 seconds Lookups/s: 4,691,618 2. default interleave $ numactl –i 0,2 ./XSBench –s XL –p 5000000 Runtime: 55.243 seconds Lookups/s: 3,077,293 3. weighted interleave numactl –w –i 0,2 ./XSBench –s XL –p 5000000 Runtime: 29.262 seconds Lookups/s: 5,809,513 ===================================================================== LTP Tests: https://github.com/gmprice/ltp/tree/mempolicy2 = Existing tests set_mempolicy, get_mempolicy, mbind MPOL_WEIGHTED_INTERLEAVE added manually to test basic functionality but did not adjust tests for weighting. Basically the weights were set to 1, which is the default, and it should behave the same as MPOL_INTERLEAVE if logic is correct. == set_mempolicy01 : passed 18, failed 0 == set_mempolicy02 : passed 10, failed 0 == set_mempolicy03 : passed 64, failed 0 == set_mempolicy04 : passed 32, failed 0 == set_mempolicy05 - n/a on non-x86 == set_mempolicy06 : passed 10, failed 0 this is set_mempolicy02 + MPOL_WEIGHTED_INTERLEAVE == set_mempolicy07 : passed 32, failed 0 set_mempolicy04 + MPOL_WEIGHTED_INTERLEAVE == get_mempolicy01 : passed 12, failed 0 change: added MPOL_WEIGHTED_INTERLEAVE == get_mempolicy02 : passed 2, failed 0 == mbind01 : passed 15, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind02 : passed 4, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind03 : passed 16, failed 0 added MPOL_WEIGHTED_INTERLEAVE == mbind04 : passed 48, failed 0 added MPOL_WEIGHTED_INTERLEAVE ===================================================================== numactl (set_mempolicy) w/ global weighting test numactl fork: https://github.com/gmprice/numactl/tree/weighted_interleave_master command: numactl -w --interleave=0,1 ./eatmem result (weights 1:1): 0176a000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=32897 N1=32896 kernelpagesize_kB=4 7fceeb9ff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=32768 N1=32769 kernelpagesize_kB=4 50% distribution is correct result (weights 5:1): 01b14000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=54828 N1=10965 kernelpagesize_kB=4 7f47a1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=54614 N1=10923 kernelpagesize_kB=4 16.666% distribution is correct result (weights 1:5): 01f07000 weighted interleave:0-1 heap anon=65793 dirty=65793 active=0 N0=10966 N1=54827 kernelpagesize_kB=4 7f17b1dff000 weighted interleave:0-1 anon=65537 dirty=65537 active=0 N0=10923 N1=54614 kernelpagesize_kB=4 16.666% distribution is correct #include <stdio.h> #include <stdlib.h> #include <string.h> int main (void) { char* mem = malloc(1024*1024*256); memset(mem, 1, 1024*1024*256); for (int i = 0; i < ((1024*1024*256)/4096); i++) { mem = malloc(4096); mem[0] = 1; } printf("done\n"); getchar(); return 0; } This patch (of 4): This patch provides a way to set interleave weight information under sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN The sysfs structure is designed as follows. $ tree /sys/kernel/mm/mempolicy/ /sys/kernel/mm/mempolicy/ [1] └── weighted_interleave [2] ├── node0 [3] └── node1 Each file above can be explained as follows. [1] mm/mempolicy: configuration interface for mempolicy subsystem [2] weighted_interleave/: config interface for weighted interleave policy [3] weighted_interleave/nodeN: weight for nodeN If a node value is set to `0`, the system-default value will be used. As of this patch, the system-default for all nodes is always 1. Link: https://lkml.kernel.org/r/20240202170238.90004-1-gregory.price@memverge.com Link: https://lkml.kernel.org/r/20240202170238.90004-2-gregory.price@memverge.com Suggested-by: "Huang, Ying" <ying.huang@intel.com> Signed-off-by: Rakie Kim <rakie.kim@sk.com> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com> Co-developed-by: Gregory Price <gregory.price@memverge.com> Signed-off-by: Gregory Price <gregory.price@memverge.com> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Gregory Price <gourry.memverge@gmail.com> Cc: Hasan Al Maruf <Hasan.Maruf@amd.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@kernel.org> Cc: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2024-02-02 17:02:35 +00:00
#ifdef CONFIG_SYSFS
struct iw_node_attr {
struct kobj_attribute kobj_attr;
int nid;
};
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct iw_node_attr *node_attr;
u8 weight;
node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
weight = get_il_weight(node_attr->nid);
return sysfs_emit(buf, "%d\n", weight);
}
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
struct iw_node_attr *node_attr;
u8 *new;
u8 *old;
u8 weight = 0;
node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
if (count == 0 || sysfs_streq(buf, ""))
weight = 0;
else if (kstrtou8(buf, 0, &weight))
return -EINVAL;
new = kzalloc(nr_node_ids, GFP_KERNEL);
if (!new)
return -ENOMEM;
mutex_lock(&iw_table_lock);
old = rcu_dereference_protected(iw_table,
lockdep_is_held(&iw_table_lock));
if (old)
memcpy(new, old, nr_node_ids);
new[node_attr->nid] = weight;
rcu_assign_pointer(iw_table, new);
mutex_unlock(&iw_table_lock);
synchronize_rcu();
kfree(old);
return count;
}
static struct iw_node_attr **node_attrs;
static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
struct kobject *parent)
{
if (!node_attr)
return;
sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
kfree(node_attr->kobj_attr.attr.name);
kfree(node_attr);
}
static void sysfs_wi_release(struct kobject *wi_kobj)
{
int i;
for (i = 0; i < nr_node_ids; i++)
sysfs_wi_node_release(node_attrs[i], wi_kobj);
kobject_put(wi_kobj);
}
static const struct kobj_type wi_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = sysfs_wi_release,
};
static int add_weight_node(int nid, struct kobject *wi_kobj)
{
struct iw_node_attr *node_attr;
char *name;
node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
if (!node_attr)
return -ENOMEM;
name = kasprintf(GFP_KERNEL, "node%d", nid);
if (!name) {
kfree(node_attr);
return -ENOMEM;
}
sysfs_attr_init(&node_attr->kobj_attr.attr);
node_attr->kobj_attr.attr.name = name;
node_attr->kobj_attr.attr.mode = 0644;
node_attr->kobj_attr.show = node_show;
node_attr->kobj_attr.store = node_store;
node_attr->nid = nid;
if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
kfree(node_attr->kobj_attr.attr.name);
kfree(node_attr);
pr_err("failed to add attribute to weighted_interleave\n");
return -ENOMEM;
}
node_attrs[nid] = node_attr;
return 0;
}
static int add_weighted_interleave_group(struct kobject *root_kobj)
{
struct kobject *wi_kobj;
int nid, err;
wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
if (!wi_kobj)
return -ENOMEM;
err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
"weighted_interleave");
if (err) {
kfree(wi_kobj);
return err;
}
for_each_node_state(nid, N_POSSIBLE) {
err = add_weight_node(nid, wi_kobj);
if (err) {
pr_err("failed to add sysfs [node%d]\n", nid);
break;
}
}
if (err)
kobject_put(wi_kobj);
return 0;
}
static void mempolicy_kobj_release(struct kobject *kobj)
{
u8 *old;
mutex_lock(&iw_table_lock);
old = rcu_dereference_protected(iw_table,
lockdep_is_held(&iw_table_lock));
rcu_assign_pointer(iw_table, NULL);
mutex_unlock(&iw_table_lock);
synchronize_rcu();
kfree(old);
kfree(node_attrs);
kfree(kobj);
}
static const struct kobj_type mempolicy_ktype = {
.release = mempolicy_kobj_release
};
static int __init mempolicy_sysfs_init(void)
{
int err;
static struct kobject *mempolicy_kobj;
mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
if (!mempolicy_kobj) {
err = -ENOMEM;
goto err_out;
}
node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
GFP_KERNEL);
if (!node_attrs) {
err = -ENOMEM;
goto mempol_out;
}
err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
"mempolicy");
if (err)
goto node_out;
err = add_weighted_interleave_group(mempolicy_kobj);
if (err) {
pr_err("mempolicy sysfs structure failed to initialize\n");
kobject_put(mempolicy_kobj);
return err;
}
return err;
node_out:
kfree(node_attrs);
mempol_out:
kfree(mempolicy_kobj);
err_out:
pr_err("failed to add mempolicy kobject to the system\n");
return err;
}
late_initcall(mempolicy_sysfs_init);
#endif /* CONFIG_SYSFS */