diff --git a/.mailmap b/.mailmap index ac3decd2c756..318e63f338b1 100644 --- a/.mailmap +++ b/.mailmap @@ -130,6 +130,7 @@ Domen Puncer Douglas Gilbert Ed L. Cashin Erik Kaneda +Eugen Hristev Evgeniy Polyakov Ezequiel Garcia Felipe W Damasio diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c8ae7c897f14..74cec76be9f2 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1245,13 +1245,17 @@ PAGE_SIZE multiple when read back. This is a simple interface to trigger memory reclaim in the target cgroup. - This file accepts a string which contains the number of bytes to - reclaim. + This file accepts a single key, the number of bytes to reclaim. + No nested keys are currently supported. Example:: echo "1G" > memory.reclaim + The interface can be later extended with nested keys to + configure the reclaim behavior. For example, specify the + type of memory to reclaim from (anon, file, ..). + Please note that the kernel can over or under reclaim from the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. @@ -1263,13 +1267,6 @@ PAGE_SIZE multiple when read back. This means that the networking layer will not adapt based on reclaim induced by memory.reclaim. - This file also allows the user to specify the nodes to reclaim from, - via the 'nodes=' key, for example:: - - echo "1G nodes=0,1" > memory.reclaim - - The above instructs the kernel to reclaim memory from nodes 0,1. - memory.peak A read-only single value file which exists on non-root cgroups. diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index f6a502e8f02c..6e948d015332 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -170,6 +170,9 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u asmlinkage long ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp) { + struct timespec64 rtn_tp; + s64 tick_ns; + /* * ia64's clock_gettime() syscall is implemented as a vdso call * fsys_clock_gettime(). 
Currently it handles only @@ -185,8 +188,8 @@ ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user * switch (which_clock) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: - s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); - struct timespec64 rtn_tp = ns_to_timespec64(tick_ns); + tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); + rtn_tp = ns_to_timespec64(tick_ns); return put_timespec64(&rtn_tp, tp); } diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 3161b9ccd2a5..b6276a3521d7 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -4,6 +4,7 @@ * Written by Niibe Yutaka and Paul Mundt */ OUTPUT_ARCH(sh) +#define RUNTIME_DISCARD_EXIT #include #include #include diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index f08b25195ae7..d1a68b6d03b3 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -26,7 +26,6 @@ #include #include #include -#include #include /* for COMMAND_LINE_SIZE */ #include @@ -525,12 +524,10 @@ static int __init __reserved_mem_reserve_reg(unsigned long node, size = dt_mem_next_cell(dt_root_size_cells, &prop); if (size && - early_init_dt_reserve_memory(base, size, nomap) == 0) { + early_init_dt_reserve_memory(base, size, nomap) == 0) pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); - if (!nomap) - kmemleak_alloc_phys(base, size, 0); - } else + else pr_err("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index c05c71d57291..0e2fc08f7de4 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -8,7 +8,7 @@ config VXFS_FS of SCO UnixWare (and possibly others) and optionally available for Sunsoft Solaris, HP-UX and many other operating systems. However these particular OS implementations of vxfs may differ in on-disk - data endianess and/or superblock offset. The vxfs module has been + data endianness and/or superblock offset. The vxfs module has been tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.) Currently only readonly access is supported and VxFX versions 2, 3 and 4. Tests were performed with HP-UX VxFS version 3. 
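The ia64_clock_getres() hunk above moves the tick_ns and rtn_tp declarations to the top of the function because, in C17 and earlier, a declaration cannot directly follow a `case` label, so declaring them inside the switch breaks the build. A minimal stand-alone sketch of the same pattern; the clock IDs and 400 MHz frequency below are made up and stand in for the kernel's NSEC_PER_SEC and local_cpu_data->itc_freq:

```c
#include <stdio.h>

/*
 * Illustration only: clock IDs and the 400 MHz frequency are invented.
 * Writing "long tick_ns = ...;" directly after "case 1:" is rejected by
 * C17-and-earlier compilers ("a label can only be part of a statement and
 * a declaration is not a statement"), so the locals are declared at the
 * top of the function, mirroring the hunk above.
 */
static long clock_resolution_ns(int which_clock)
{
	long tick_ns;				/* hoisted declaration */

	switch (which_clock) {
	case 0:
	case 1:
		tick_ns = (1000000000L + 400000000L - 1) / 400000000L;
		return tick_ns;			/* 3 ns for a 400 MHz counter */
	default:
		return -1;
	}
}

int main(void)
{
	printf("resolution: %ld ns\n", clock_resolution_ns(1));
	return 0;
}
```

Opening a braced block after the label would also compile; hoisting the declarations keeps the change minimal.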
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e35a0398db63..af1c49ae11b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -745,9 +745,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, page = pfn_swap_entry_to_page(swpent); } if (page) { - int mapcount = page_mapcount(page); - - if (mapcount >= 2) + if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else mss->private_hugetlb += huge_page_size(hstate_vma(vma)); diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index b3fdc8212c5f..95f8e8901768 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -183,7 +183,7 @@ static inline int squashfs_block_size(__le32 raw) #define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ sizeof(u64)) /* xattr id lookup table defines */ -#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id)) +#define SQUASHFS_XATTR_BYTES(A) (((u64) (A)) * sizeof(struct squashfs_xattr_id)) #define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ SQUASHFS_METADATA_SIZE) diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 659082e9e51d..72f6f4b37863 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -63,7 +63,7 @@ struct squashfs_sb_info { long long bytes_used; unsigned int inodes; unsigned int fragments; - int xattr_ids; + unsigned int xattr_ids; unsigned int ids; bool panic_on_errors; const struct squashfs_decompressor_thread_ops *thread_ops; diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index d8a270d3ac4c..f1a463d8bfa0 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -10,12 +10,12 @@ #ifdef CONFIG_SQUASHFS_XATTR extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, - u64 *, int *); + u64 *, unsigned int *); extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, unsigned int *, unsigned long long *); #else static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, - u64 start, u64 *xattr_table_start, int *xattr_ids) + u64 start, u64 *xattr_table_start, unsigned int *xattr_ids) { struct squashfs_xattr_id_table *id_table; diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index 087cab8c78f4..b88d19e9581e 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -56,7 +56,7 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, * Read uncompressed xattr id lookup table indexes from disk into memory */ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, - u64 *xattr_table_start, int *xattr_ids) + u64 *xattr_table_start, unsigned int *xattr_ids) { struct squashfs_sb_info *msblk = sb->s_fs_info; unsigned int len, indexes; @@ -76,7 +76,7 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, /* Sanity check values */ /* there is always at least one xattr id */ - if (*xattr_ids == 0) + if (*xattr_ids <= 0) return ERR_PTR(-EINVAL); len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 034b1106d022..e098f38422af 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -200,7 +200,7 @@ static inline void *kmap_local_pfn(unsigned long pfn) static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(addr); + kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif } @@ -227,7 +227,7 @@ static inline 
void *kmap_atomic_pfn(unsigned long pfn) static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(addr); + kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 551834cd5299..db194e2ba69f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -1187,6 +1188,18 @@ static inline __init void hugetlb_cma_reserve(int order) } #endif +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +static inline bool hugetlb_pmd_shared(pte_t *pte) +{ + return page_count(virt_to_page(pte)) > 1; +} +#else +static inline bool hugetlb_pmd_shared(pte_t *pte) +{ + return false; +} +#endif + bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d3c8203cab6c..85dc9b88ea37 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1666,10 +1666,13 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { + struct mem_cgroup *memcg; + if (mem_cgroup_disabled()) return; - if (unlikely(&folio_memcg(folio)->css != wb->memcg_css)) + memcg = folio_memcg(folio); + if (unlikely(memcg && &memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } diff --git a/include/linux/swap.h b/include/linux/swap.h index 2787b84eaf12..0ceed49516ad 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -418,8 +418,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options, - nodemask_t *nodemask); + unsigned int reclaim_options); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 61a9425a311f..02ee440f7be3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -754,6 +754,7 @@ config DEBUG_KMEMLEAK select KALLSYMS select CRC32 select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF help Say Y here if you want to enable the memory leak detector. The memory allocation/freeing is traced in a way @@ -1207,7 +1208,7 @@ config SCHED_DEBUG depends on DEBUG_KERNEL && PROC_FS default y help - If you say Y here, the /proc/sched_debug file will be provided + If you say Y here, the /sys/kernel/debug/sched file will be provided that can help debug the scheduler. The runtime overhead of this option is minimal. 
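The squashfs hunks above widen SQUASHFS_XATTR_BYTES() to 64-bit and make the xattr id count unsigned because the count is read straight from the on-disk table and feeds size calculations. A stand-alone sketch of the overflow class this guards against; the xattr_id_entry struct only matches struct squashfs_xattr_id in size (16 bytes), and the count is a hypothetical corrupted value, not anything from the kernel:

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Illustration only: with a hostile on-disk count, the byte-size product
 * wraps to 0 in 32-bit arithmetic and would sail past later sanity checks;
 * widening one operand to 64 bits first, as the macro hunk above does,
 * keeps the real value.
 */
struct xattr_id_entry {
	uint64_t xattr;
	uint32_t count;
	uint32_t size;
};					/* 16 bytes, like the real entry */

int main(void)
{
	uint32_t ids = UINT32_C(0x20000000);	/* bogus count from a corrupted image */

	uint32_t wrapped = ids * (uint32_t)sizeof(struct xattr_id_entry);
	uint64_t widened = (uint64_t)ids * sizeof(struct xattr_id_entry);

	printf("32-bit product: %" PRIu32 "\n", wrapped);	/* prints 0 */
	printf("64-bit product: %" PRIu64 "\n", widened);	/* prints 8589934592 */
	return 0;
}
```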
diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 26e2045d3cda..5a976393c9ae 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -670,12 +670,13 @@ static inline unsigned long mte_pivot(const struct maple_enode *mn, unsigned char piv) { struct maple_node *node = mte_to_node(mn); + enum maple_type type = mte_node_type(mn); - if (piv >= mt_pivots[piv]) { + if (piv >= mt_pivots[type]) { WARN_ON(1); return 0; } - switch (mte_node_type(mn)) { + switch (type) { case maple_arange_64: return node->ma64.pivot[piv]; case maple_range_64: @@ -4887,7 +4888,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) unsigned long *pivots, *gaps; void __rcu **slots; unsigned long gap = 0; - unsigned long max, min, index; + unsigned long max, min; unsigned char offset; if (unlikely(mas_is_err(mas))) @@ -4909,8 +4910,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) min = mas_safe_min(mas, pivots, --offset); max = mas_safe_pivot(mas, pivots, offset, type); - index = mas->index; - while (index <= max) { + while (mas->index <= max) { gap = 0; if (gaps) gap = gaps[offset]; @@ -4941,10 +4941,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) min = mas_safe_min(mas, pivots, offset); } - if (unlikely(index > max)) { - mas_set_err(mas, -EBUSY); - return false; - } + if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) + goto no_space; if (unlikely(ma_is_leaf(type))) { mas->offset = offset; @@ -4961,9 +4959,11 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) return false; ascend: - if (mte_is_root(mas->node)) - mas_set_err(mas, -EBUSY); + if (!mte_is_root(mas->node)) + return false; +no_space: + mas_set_err(mas, -EBUSY); return false; } diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 497fc93ccf9e..ec847bf4dcb4 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -2517,6 +2517,91 @@ static noinline void check_bnode_min_spanning(struct maple_tree *mt) mt_set_non_kernel(0); } +static noinline void check_empty_area_window(struct maple_tree *mt) +{ + unsigned long i, nr_entries = 20; + MA_STATE(mas, mt, 0, 0); + + for (i = 1; i <= nr_entries; i++) + mtree_store_range(mt, i*10, i*10 + 9, + xa_mk_value(i), GFP_KERNEL); + + /* Create another hole besides the one at 0 */ + mtree_store_range(mt, 160, 169, NULL, GFP_KERNEL); + + /* Check lower bounds that don't fit */ + rcu_read_lock(); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 10) != -EBUSY); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 6, 90, 5) != -EBUSY); + + /* Check lower bound that does fit */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 5) != 0); + MT_BUG_ON(mt, mas.index != 5); + MT_BUG_ON(mt, mas.last != 9); + rcu_read_unlock(); + + /* Check one gap that doesn't fit and one that does */ + rcu_read_lock(); + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 217, 9) != 0); + MT_BUG_ON(mt, mas.index != 161); + MT_BUG_ON(mt, mas.last != 169); + + /* Check one gap that does fit above the min */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 3) != 0); + MT_BUG_ON(mt, mas.index != 216); + MT_BUG_ON(mt, mas.last != 218); + + /* Check size that doesn't fit any gap */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 16) != -EBUSY); + + /* + * Check size that doesn't fit the lower end of the window but + * does fit the gap + */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 167, 200, 4) != -EBUSY); + + /* + * Check size that doesn't 
fit the upper end of the window but + * does fit the gap + */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 162, 4) != -EBUSY); + + /* Check mas_empty_area forward */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 9) != 0); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 8); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 4) != 0); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 3); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 11) != -EBUSY); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 5, 100, 6) != -EBUSY); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EBUSY); + + mas_reset(&mas); + mas_empty_area(&mas, 100, 165, 3); + + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 100, 163, 6) != -EBUSY); + rcu_read_unlock(); +} + static DEFINE_MTREE(tree); static int maple_tree_seed(void) { @@ -2765,6 +2850,10 @@ static int maple_tree_seed(void) check_bnode_min_spanning(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_empty_area_window(&tree); + mtree_destroy(&tree); + #if defined(BENCH) skip: #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7fcdb98c9e68..bdbfeb6fb393 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5051,6 +5051,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry); } else if (unlikely(is_pte_marker(entry))) { + /* No swap on hugetlb */ + WARN_ON_ONCE( + is_swapin_error_entry(pte_to_swp_entry(entry))); /* * We copy the pte marker only if the dst vma has * uffd-wp enabled. diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 79be13133322..90acfea40c13 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -847,6 +847,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } +/* + * See pmd_trans_unstable() for how the result may change out from + * underneath us, even if we hold mmap_lock in read. + */ static int find_pmd_or_thp_or_none(struct mm_struct *mm, unsigned long address, pmd_t **pmd) @@ -865,8 +869,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, #endif if (pmd_none(pmde)) return SCAN_PMD_NONE; + if (!pmd_present(pmde)) + return SCAN_PMD_NULL; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; + if (pmd_devmap(pmde)) + return SCAN_PMD_NULL; if (pmd_bad(pmde)) return SCAN_PMD_NULL; return SCAN_SUCCEED; @@ -1642,7 +1650,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, * has higher cost too. It would also probably require locking * the anon_vma. */ - if (vma->anon_vma) { + if (READ_ONCE(vma->anon_vma)) { result = SCAN_PAGE_ANON; goto next; } @@ -1670,6 +1678,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, result = SCAN_PTE_MAPPED_HUGEPAGE; if ((cc->is_khugepaged || is_target) && mmap_write_trylock(mm)) { + /* + * Re-check whether we have an ->anon_vma, because + * collapse_and_free_pmd() requires that either no + * ->anon_vma exists or the anon_vma is locked. + * We already checked ->anon_vma above, but that check + * is racy because ->anon_vma can be populated under the + * mmap lock in read mode. 
+ */ + if (vma->anon_vma) { + result = SCAN_PAGE_ANON; + goto unlock_next; + } /* * When a vma is registered with uffd-wp, we can't * recycle the pmd pgtable because there can be pte diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 92f670edbf51..55dc8b8b0616 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -2070,8 +2070,10 @@ static int __init kmemleak_boot_config(char *str) return -EINVAL; if (strcmp(str, "off") == 0) kmemleak_disable(); - else if (strcmp(str, "on") == 0) + else if (strcmp(str, "on") == 0) { kmemleak_skip_disable = 1; + stack_depot_want_early_init(); + } else return -EINVAL; return 0; @@ -2093,7 +2095,6 @@ void __init kmemleak_init(void) if (kmemleak_error) return; - stack_depot_init(); jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab457f0394ab..73afff8062f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,7 +63,6 @@ #include #include #include -#include #include "internal.h" #include #include @@ -2393,8 +2392,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP, - NULL); + MEMCG_RECLAIM_MAY_SWAP); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2685,8 +2683,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options, - NULL); + gfp_mask, reclaim_options); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3506,8 +3503,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, - NULL)) { + memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP)) { ret = -EBUSY; break; } @@ -3618,8 +3614,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP, - NULL)) + MEMCG_RECLAIM_MAY_SWAP)) nr_retries--; } @@ -6429,8 +6424,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, - NULL); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); if (!reclaimed && !nr_retries--) break; @@ -6479,8 +6473,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, - NULL)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) nr_reclaims--; continue; } @@ -6603,54 +6596,21 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } -enum { - MEMORY_RECLAIM_NODES = 0, - MEMORY_RECLAIM_NULL, -}; - -static const match_table_t if_tokens = { - { MEMORY_RECLAIM_NODES, "nodes=%s" }, - { MEMORY_RECLAIM_NULL, NULL }, -}; - static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; - unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP | - MEMCG_RECLAIM_PROACTIVE; - char *old_buf, *start; - substring_t args[MAX_OPT_ARGS]; - int token; - char value[256]; - nodemask_t nodemask = NODE_MASK_ALL; + unsigned int reclaim_options; + int err; buf = strstrip(buf); + err = page_counter_memparse(buf, "", &nr_to_reclaim); + if (err) + return err; - old_buf = buf; - nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; - if (buf == old_buf) - return -EINVAL; - - buf = strstrip(buf); - - while ((start = strsep(&buf, " ")) != NULL) { - if (!strlen(start)) - continue; - token = match_token(start, if_tokens, args); - match_strlcpy(value, args, sizeof(value)); - switch (token) { - case MEMORY_RECLAIM_NODES: - if (nodelist_parse(value, nodemask) < 0) - return -EINVAL; - break; - default: - return -EINVAL; - } - } - + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6667,8 +6627,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, reclaim_options, - &nodemask); + GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/memory.c b/mm/memory.c index aad226daf41b..3e836fecd035 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -828,12 +828,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, return -EBUSY; return -ENOENT; } else if (is_pte_marker_entry(entry)) { - /* - * We're copying the pgtable should only because dst_vma has - * uffd-wp enabled, do sanity check. - */ - WARN_ON_ONCE(!userfaultfd_wp(dst_vma)); - set_pte_at(dst_mm, addr, dst_pte, pte); + if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma)) + set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } if (!userfaultfd_wp(dst_vma)) @@ -3629,8 +3625,12 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) /* * Be careful so that we will only recover a special uffd-wp pte into a * none pte. Otherwise it means the pte could have changed, so retry. + * + * This should also cover the case where e.g. 
the pte changed + * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. + * So is_pte_marker() check is not enough to safely drop the pte. */ - if (is_pte_marker(*vmf->pte)) + if (pte_same(vmf->orig_pte, *vmf->pte)) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02c8a712282f..f940395667c8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -600,7 +600,8 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && + !hugetlb_pmd_shared(pte))) { if (isolate_hugetlb(page, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* diff --git a/mm/mprotect.c b/mm/mprotect.c index 908df12caa26..61cf60015a8b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -245,7 +245,13 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, newpte = pte_swp_mksoft_dirty(newpte); if (pte_swp_uffd_wp(oldpte)) newpte = pte_swp_mkuffd_wp(newpte); - } else if (pte_marker_entry_uffd_wp(entry)) { + } else if (is_pte_marker_entry(entry)) { + /* + * Ignore swapin errors unconditionally, + * because any access should sigbus anyway. + */ + if (is_swapin_error_entry(entry)) + continue; /* * If this is uffd-wp pte marker and we'd like * to unprotect it, drop it; the next page diff --git a/mm/mremap.c b/mm/mremap.c index fe587c5d6591..930f65c315c0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1027,16 +1027,29 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } /* - * Function vma_merge() is called on the extension we are adding to - * the already existing vma, vma_merge() will merge this extension with - * the already existing vma (expand operation itself) and possibly also - * with the next vma if it becomes adjacent to the expanded vma and - * otherwise compatible. + * Function vma_merge() is called on the extension we + * are adding to the already existing vma, vma_merge() + * will merge this extension with the already existing + * vma (expand operation itself) and possibly also with + * the next vma if it becomes adjacent to the expanded + * vma and otherwise compatible. + * + * However, vma_merge() can currently fail due to + * is_mergeable_vma() check for vm_ops->close (see the + * comment there). Yet this should not prevent vma + * expanding, so perform a simple expand for such vma. + * Ideally the check for close op should be only done + * when a vma would be actually removed due to a merge. 
*/ - vma = vma_merge(mm, vma, extension_start, extension_end, + if (!vma->vm_ops || !vma->vm_ops->close) { + vma = vma_merge(mm, vma, extension_start, extension_end, vma->vm_flags, vma->anon_vma, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + } else if (vma_adjust(vma, vma->vm_start, addr + new_len, + vma->vm_pgoff, NULL)) { + vma = NULL; + } if (!vma) { vm_unacct_memory(pages); ret = -ENOMEM; diff --git a/mm/swapfile.c b/mm/swapfile.c index 908a529bca12..4fa440e87cd6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1100,6 +1100,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); + cond_resched(); spin_lock(&swap_avail_lock); nextsi: diff --git a/mm/vmscan.c b/mm/vmscan.c index bd6637fcd8f9..bf3eedf0209c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3323,13 +3323,16 @@ void lru_gen_migrate_mm(struct mm_struct *mm) if (mem_cgroup_disabled()) return; + /* migration can happen before addition */ + if (!mm->lru_gen.memcg) + return; + rcu_read_lock(); memcg = mem_cgroup_from_task(task); rcu_read_unlock(); if (memcg == mm->lru_gen.memcg) return; - VM_WARN_ON_ONCE(!mm->lru_gen.memcg); VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); lru_gen_del_mm(mm); @@ -6754,8 +6757,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options, - nodemask_t *nodemask) + unsigned int reclaim_options) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -6770,7 +6772,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), - .nodemask = nodemask, }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9445bee6b014..702bc3fd687a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -113,7 +113,23 @@ * have room for two bit at least. */ #define OBJ_ALLOCATED_TAG 1 -#define OBJ_TAG_BITS 1 + +#ifdef CONFIG_ZPOOL +/* + * The second least-significant bit in the object's header identifies if the + * value stored at the header is a deferred handle from the last reclaim + * attempt. + * + * As noted above, this is valid because we have room for two bits. + */ +#define OBJ_DEFERRED_HANDLE_TAG 2 +#define OBJ_TAG_BITS 2 +#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG) +#else +#define OBJ_TAG_BITS 1 +#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG +#endif /* CONFIG_ZPOOL */ + #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) @@ -222,6 +238,12 @@ struct link_free { * Handle of allocated object. */ unsigned long handle; +#ifdef CONFIG_ZPOOL + /* + * Deferred handle of a reclaimed object. 
+ */ + unsigned long deferred_handle; +#endif }; }; @@ -272,8 +294,6 @@ struct zspage { /* links the zspage to the lru list in the pool */ struct list_head lru; bool under_reclaim; - /* list of unfreed handles whose objects have been reclaimed */ - unsigned long *deferred_handles; #endif struct zs_pool *pool; @@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) +static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle, + int tag) { unsigned long handle; struct zspage *zspage = get_zspage(page); @@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) } else handle = *(unsigned long *)obj; - if (!(handle & OBJ_ALLOCATED_TAG)) + if (!(handle & tag)) return false; - *phandle = handle & ~OBJ_ALLOCATED_TAG; + /* Clear all tags before returning the handle */ + *phandle = handle & ~OBJ_TAG_MASK; return true; } +static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) +{ + return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG); +} + +#ifdef CONFIG_ZPOOL +static bool obj_stores_deferred_handle(struct page *page, void *obj, + unsigned long *phandle) +{ + return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG); +} +#endif + static void reset_page(struct page *page) { __ClearPageMovable(page); @@ -946,22 +981,36 @@ static int trylock_zspage(struct zspage *zspage) } #ifdef CONFIG_ZPOOL +static unsigned long find_deferred_handle_obj(struct size_class *class, + struct page *page, int *obj_idx); + /* * Free all the deferred handles whose objects are freed in zs_free. */ -static void free_handles(struct zs_pool *pool, struct zspage *zspage) +static void free_handles(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) { - unsigned long handle = (unsigned long)zspage->deferred_handles; + int obj_idx = 0; + struct page *page = get_first_page(zspage); + unsigned long handle; - while (handle) { - unsigned long nxt_handle = handle_to_obj(handle); + while (1) { + handle = find_deferred_handle_obj(class, page, &obj_idx); + if (!handle) { + page = get_next_page(page); + if (!page) + break; + obj_idx = 0; + continue; + } cache_free_handle(pool, handle); - handle = nxt_handle; + obj_idx++; } } #else -static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {} +static inline void free_handles(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) {} #endif static void __free_zspage(struct zs_pool *pool, struct size_class *class, @@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, VM_BUG_ON(fg != ZS_EMPTY); /* Free all deferred handles from zs_free */ - free_handles(pool, zspage); + free_handles(pool, class, zspage); next = page = get_first_page(zspage); do { @@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) #ifdef CONFIG_ZPOOL INIT_LIST_HEAD(&zspage->lru); zspage->under_reclaim = false; - zspage->deferred_handles = NULL; #endif set_freeobj(zspage, 0); @@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(int class_size, unsigned long obj) +static void obj_free(int class_size, unsigned long obj, unsigned long *handle) { struct link_free *link; struct zspage *zspage; @@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj) 
zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); - - /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); - if (likely(!ZsHugePage(zspage))) - link->next = get_freeobj(zspage) << OBJ_TAG_BITS; - else - f_page->index = 0; + + if (handle) { +#ifdef CONFIG_ZPOOL + /* Stores the (deferred) handle in the object's header */ + *handle |= OBJ_DEFERRED_HANDLE_TAG; + *handle &= ~OBJ_ALLOCATED_TAG; + + if (likely(!ZsHugePage(zspage))) + link->deferred_handle = *handle; + else + f_page->index = *handle; +#endif + } else { + /* Insert this object in containing zspage's freelist */ + if (likely(!ZsHugePage(zspage))) + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + else + f_page->index = 0; + set_freeobj(zspage, f_objidx); + } + kunmap_atomic(vaddr); - set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); } @@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle) zspage = get_zspage(f_page); class = zspage_class(pool, zspage); - obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); #ifdef CONFIG_ZPOOL @@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle) * Reclaim needs the handles during writeback. It'll free * them along with the zspage when it's done with them. * - * Record current deferred handle at the memory location - * whose address is given by handle. + * Record current deferred handle in the object's header. */ - record_obj(handle, (unsigned long)zspage->deferred_handles); - zspage->deferred_handles = (unsigned long *)handle; + obj_free(class->size, obj, &handle); spin_unlock(&pool->lock); return; } #endif + obj_free(class->size, obj, NULL); + fullness = fix_fullness_group(class, zspage); if (fullness == ZS_EMPTY) free_zspage(pool, class, zspage); @@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, } /* - * Find alloced object in zspage from index object and + * Find object with a certain tag in zspage from index object and * return handle. */ -static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int *obj_idx) +static unsigned long find_tagged_obj(struct size_class *class, + struct page *page, int *obj_idx, int tag) { unsigned int offset; int index = *obj_idx; @@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_allocated(page, addr + offset, &handle)) + if (obj_tagged(page, addr + offset, &handle, tag)) break; offset += class->size; @@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class, return handle; } +/* + * Find alloced object in zspage from index object and + * return handle. + */ +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int *obj_idx) +{ + return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG); +} + +#ifdef CONFIG_ZPOOL +/* + * Find object storing a deferred handle in header in zspage from index object + * and return handle. 
+ */ +static unsigned long find_deferred_handle_obj(struct size_class *class, + struct page *page, int *obj_idx) +{ + return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG); +} +#endif + struct zs_compact_control { /* Source spage for migration which could be a subpage of zspage */ struct page *s_page; @@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, zs_object_copy(class, free_obj, used_obj); obj_idx++; record_obj(handle, free_obj); - obj_free(class->size, used_obj); + obj_free(class->size, used_obj, NULL); } /* Remember last position in this iteration */ @@ -2478,6 +2561,90 @@ void zs_destroy_pool(struct zs_pool *pool) EXPORT_SYMBOL_GPL(zs_destroy_pool); #ifdef CONFIG_ZPOOL +static void restore_freelist(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + unsigned int obj_idx = 0; + unsigned long handle, off = 0; /* off is within-page offset */ + struct page *page = get_first_page(zspage); + struct link_free *prev_free = NULL; + void *prev_page_vaddr = NULL; + + /* in case no free object found */ + set_freeobj(zspage, (unsigned int)(-1UL)); + + while (page) { + void *vaddr = kmap_atomic(page); + struct page *next_page; + + while (off < PAGE_SIZE) { + void *obj_addr = vaddr + off; + + /* skip allocated object */ + if (obj_allocated(page, obj_addr, &handle)) { + obj_idx++; + off += class->size; + continue; + } + + /* free deferred handle from reclaim attempt */ + if (obj_stores_deferred_handle(page, obj_addr, &handle)) + cache_free_handle(pool, handle); + + if (prev_free) + prev_free->next = obj_idx << OBJ_TAG_BITS; + else /* first free object found */ + set_freeobj(zspage, obj_idx); + + prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free); + /* if last free object in a previous page, need to unmap */ + if (prev_page_vaddr) { + kunmap_atomic(prev_page_vaddr); + prev_page_vaddr = NULL; + } + + obj_idx++; + off += class->size; + } + + /* + * Handle the last (full or partial) object on this page. + */ + next_page = get_next_page(page); + if (next_page) { + if (!prev_free || prev_page_vaddr) { + /* + * There is no free object in this page, so we can safely + * unmap it. + */ + kunmap_atomic(vaddr); + } else { + /* update prev_page_vaddr since prev_free is on this page */ + prev_page_vaddr = vaddr; + } + } else { /* this is the last page */ + if (prev_free) { + /* + * Reset OBJ_TAG_BITS bit to last link to tell + * whether it's allocated object or not. + */ + prev_free->next = -1UL << OBJ_TAG_BITS; + } + + /* unmap previous page (if not done yet) */ + if (prev_page_vaddr) { + kunmap_atomic(prev_page_vaddr); + prev_page_vaddr = NULL; + } + + kunmap_atomic(vaddr); + } + + page = next_page; + off %= PAGE_SIZE; + } +} + static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) { int i, obj_idx, ret = 0; @@ -2561,6 +2728,12 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) return 0; } + /* + * Eviction fails on one of the handles, so we need to restore zspage. + * We need to rebuild its freelist (and free stored deferred handles), + * put it back to the correct size class, and add it to the LRU list. 
+ */ + restore_freelist(pool, class, zspage); putback_zspage(class, zspage); list_add(&zspage->lru, &pool->lru); unlock_zspage(zspage); diff --git a/tools/testing/selftests/filesystems/fat/run_fat_tests.sh b/tools/testing/selftests/filesystems/fat/run_fat_tests.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c index a634f47d1e56..9a127a8fe176 100644 --- a/tools/testing/selftests/vm/hugetlb-madvise.c +++ b/tools/testing/selftests/vm/hugetlb-madvise.c @@ -17,7 +17,6 @@ #include #include #include -#define __USE_GNU #include #define MIN_FREE_PAGES 20
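The zsmalloc hunks above stop chaining deferred handles through a per-zspage list and instead store them in the freed object's header, distinguished from live handles by a second tag bit that is masked off before use. A stand-alone sketch of that style of low-bit tagging; the TAG_* names and set_tag() are invented, and a plain malloc() word stands in for a zsmalloc handle:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Illustration only: because the stored value is at least 4-byte aligned,
 * its two low bits are free to record whether the word is a live handle or
 * a handle whose free was deferred, and both bits are masked off before the
 * value is treated as a pointer again.
 */
#define TAG_ALLOCATED	0x1UL
#define TAG_DEFERRED	0x2UL
#define TAG_MASK	(TAG_ALLOCATED | TAG_DEFERRED)

static uintptr_t set_tag(uintptr_t handle, uintptr_t tag)
{
	return (handle & ~(uintptr_t)TAG_MASK) | tag;
}

int main(void)
{
	unsigned long *obj = malloc(sizeof(*obj));
	uintptr_t handle, stored;

	if (!obj)
		return 1;

	handle = (uintptr_t)obj;		/* low two bits are zero */
	stored = set_tag(handle, TAG_DEFERRED);

	assert(stored & TAG_DEFERRED);
	assert(!(stored & TAG_ALLOCATED));
	assert((stored & ~(uintptr_t)TAG_MASK) == handle);	/* pointer recovered */

	printf("handle %#lx stored as %#lx\n",
	       (unsigned long)handle, (unsigned long)stored);
	free(obj);
	return 0;
}
```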