Merge branch 'akpm' (patches from Andrew)

Merge updates from Andrew Morton:

 - various misc bits

 - most of MM (quite a lot of MM material is awaiting the merge of
   linux-next dependencies)

 - kasan

 - printk updates

 - procfs updates

 - MAINTAINERS

 - /lib updates

 - checkpatch updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (123 commits)
  init: reduce rootwait polling interval time to 5ms
  binfmt_elf: use vmalloc() for allocation of vma_filesz
  checkpatch: don't emit unified-diff error for rename-only patches
  checkpatch: don't check c99 types like uint8_t under tools
  checkpatch: avoid multiple line dereferences
  checkpatch: don't check .pl files, improve absolute path commit log test
  scripts/checkpatch.pl: fix spelling
  checkpatch: don't try to get maintained status when --no-tree is given
  lib/ida: document locking requirements a bit better
  lib/rbtree.c: fix typo in comment of ____rb_erase_color
  lib/Kconfig.debug: make CONFIG_STRICT_DEVMEM depend on CONFIG_DEVMEM
  MAINTAINERS: add drm and drm/i915 irc channels
  MAINTAINERS: add "C:" for URI for chat where developers hang out
  MAINTAINERS: add drm and drm/i915 bug filing info
  MAINTAINERS: add "B:" for URI where to file bugs
  get_maintainer: look for arbitrary letter prefixes in sections
  printk: add Kconfig option to set default console loglevel
  printk/sound: handle more message headers
  printk/btrfs: handle more message headers
  printk/kdb: handle more message headers
  ...
This commit is contained in:
Linus Torvalds 2016-12-12 20:50:02 -08:00
commit e34bac726d
113 changed files with 1554 additions and 1045 deletions

View file

@ -974,6 +974,13 @@ compatibility.
4Gb. Some vendors prefer splitting those ranges into smaller
segments, but the kernel doesn't care.
Additional properties:
- hotpluggable : The presence of this property provides an explicit
hint to the operating system that this memory may potentially be
removed later. The kernel can take this into consideration when
doing nonmovable allocations and when laying out memory zones.
e) The /chosen node
This node is a bit "special". Normally, that's where Open Firmware

View file

@ -191,6 +191,7 @@ read the file /proc/PID/status:
CapPrm: 0000000000000000
CapEff: 0000000000000000
CapBnd: ffffffffffffffff
NoNewPrivs: 0
Seccomp: 0
voluntary_ctxt_switches: 0
nonvoluntary_ctxt_switches: 1
@ -262,6 +263,7 @@ Table 1-2: Contents of the status files (as of 4.1)
CapPrm bitmap of permitted capabilities
CapEff bitmap of effective capabilities
CapBnd bitmap of capabilities bounding set
NoNewPrivs no_new_privs, like prctl(PR_GET_NO_NEW_PRIVS, ...)
Seccomp seccomp mode, like prctl(PR_GET_SECCOMP, ...)
Cpus_allowed mask of CPUs on which this process may run
Cpus_allowed_list Same as previous, but in "list format"

View file

@ -2397,7 +2397,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
that the amount of memory usable for all allocations
is not too small.
movable_node [KNL,X86] Boot-time switch to enable the effects
movable_node [KNL] Boot-time switch to enable the effects
of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
MTD_Partition= [MTD]

View file

@ -136,6 +136,11 @@ or enable it back by writing 1:
echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
Some userspace (such as a test program, or an optimized memory allocation
library) may want to know the size (in bytes) of a transparent hugepage:
cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
khugepaged will be automatically started when
transparent_hugepage/enabled is set to "always" or "madvise", and it'll
be automatically shut down if it's set to "never".

View file

@ -74,6 +74,10 @@ Descriptions of section entries:
These reviewers should be CCed on patches.
L: Mailing list that is relevant to this area
W: Web-page with status/info
B: URI for where to file bugs. A web-page with detailed bug
filing info, a direct bug tracker link, or a mailto: URI.
C: URI for chat protocol, server and channel where developers
usually hang out, for example irc://server/channel.
Q: Patchwork web based patch tracking system site
T: SCM tree type and location.
Type is one of: git, hg, quilt, stgit, topgit
@ -4024,6 +4028,8 @@ DRM DRIVERS
M: David Airlie <airlied@linux.ie>
L: dri-devel@lists.freedesktop.org
T: git git://people.freedesktop.org/~airlied/linux
B: https://bugs.freedesktop.org/
C: irc://chat.freenode.net/dri-devel
S: Maintained
F: drivers/gpu/drm/
F: drivers/gpu/vga/
@ -4076,6 +4082,8 @@ M: Jani Nikula <jani.nikula@linux.intel.com>
L: intel-gfx@lists.freedesktop.org
L: dri-devel@lists.freedesktop.org
W: https://01.org/linuxgraphics/
B: https://01.org/linuxgraphics/documentation/how-report-bugs
C: irc://chat.freenode.net/intel-gfx
Q: http://patchwork.freedesktop.org/project/intel-gfx/
T: git git://anongit.freedesktop.org/drm-intel
S: Supported

View file

@ -186,6 +186,8 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr)
tlb_add_flush(tlb, addr);
}
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
tlb_remove_tlb_entry(tlb, ptep, address)
/*
* In the case of tlb vma handling, we can optimise these away in the
* case where we're doing a full MM flush. When we're doing a munmap,
@ -211,18 +213,17 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
tlb->pages[tlb->nr++] = page;
VM_WARN_ON(tlb->nr > tlb->max);
if (tlb->nr == tlb->max)
return true;
tlb->pages[tlb->nr++] = page;
return false;
}
static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
if (__tlb_remove_page(tlb, page)) {
if (__tlb_remove_page(tlb, page))
tlb_flush_mmu(tlb);
__tlb_remove_page(tlb, page);
}
}
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
@ -231,12 +232,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
return __tlb_remove_page(tlb, page);
}
static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
struct page *page)
{
return __tlb_remove_page(tlb, page);
}
static inline void tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, int page_size)
{
@ -284,5 +279,11 @@ tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr
#define tlb_migrate_finish(mm) do { } while (0)
#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
/*
 * Intentionally empty: this variant takes no action when the page size
 * passed alongside the mmu_gather changes. Both arguments are ignored.
 */
static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
unsigned int page_size)
{
}
#endif /* CONFIG_MMU */
#endif

View file

@ -207,15 +207,15 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
*/
static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
if (tlb->nr == tlb->max)
return true;
tlb->need_flush = 1;
if (!tlb->nr && tlb->pages == tlb->local)
__tlb_alloc_page(tlb);
tlb->pages[tlb->nr++] = page;
VM_WARN_ON(tlb->nr > tlb->max);
if (tlb->nr == tlb->max)
return true;
return false;
}
@ -236,10 +236,8 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
if (__tlb_remove_page(tlb, page)) {
if (__tlb_remove_page(tlb, page))
tlb_flush_mmu(tlb);
__tlb_remove_page(tlb, page);
}
}
static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
@ -248,12 +246,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
return __tlb_remove_page(tlb, page);
}
static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
struct page *page)
{
return __tlb_remove_page(tlb, page);
}
static inline void tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, int page_size)
{
@ -283,6 +275,15 @@ do { \
__tlb_remove_tlb_entry(tlb, ptep, addr); \
} while (0)
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
tlb_remove_tlb_entry(tlb, ptep, address)
#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
/*
 * Intentionally empty: this variant takes no action when the page size
 * passed alongside the mmu_gather changes. Both arguments are ignored.
 */
static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
unsigned int page_size)
{
}
#define pte_free_tlb(tlb, ptep, address) \
do { \
tlb->need_flush = 1; \

View file

@ -34,7 +34,7 @@ config NO_IOPORT_MAP
def_bool y
config NO_DMA
def_bool y
def_bool n
config HZ
int

View file

@ -3,5 +3,9 @@
*
* This file is released under the GPLv2
*/
#include <asm-generic/device.h>
/*
 * Per-device data slot holding a pointer to a DMA operations table.
 * NOTE(review): presumably left NULL when the device has no specific
 * ops -- confirm against the code that reads dma_ops.
 */
struct dev_archdata {
struct dma_map_ops *dma_ops;
};
/* Empty placeholder: no fields are defined here. */
struct pdev_archdata {
};

View file

@ -0,0 +1,32 @@
#ifndef _ASM_M32R_DMA_MAPPING_H
#define _ASM_M32R_DMA_MAPPING_H
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>
#include <linux/dma-debug.h>
#include <linux/io.h>
#define DMA_ERROR_CODE (~(dma_addr_t)0x0)
/*
 * Pick the DMA operations table for @dev.
 *
 * Falls back to the shared dma_noop_ops when @dev is NULL or when the
 * device has no table installed in its archdata; otherwise returns the
 * device's own table.
 */
static inline struct dma_map_ops *get_dma_ops(struct device *dev)
{
	if (!dev || !dev->archdata.dma_ops)
		return &dma_noop_ops;

	return dev->archdata.dma_ops;
}
/* No-op: the body is empty, so all four arguments are ignored. */
static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
enum dma_data_direction direction)
{
}
/*
 * Report whether @dev can address the range starting at @addr and
 * spanning @size bytes.
 *
 * Returns false when the device has no DMA mask; otherwise returns true
 * only if the last byte of the range (addr + size - 1) does not exceed
 * the mask value.
 */
static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
{
	return dev->dma_mask && addr + size - 1 <= *dev->dma_mask;
}
#endif /* _ASM_M32R_DMA_MAPPING_H */

View file

@ -201,6 +201,7 @@ static struct irq_chip m32700ut_lanpld_irq_type =
#define lcdpldirq2port(x) (unsigned long)((int)M32700UT_LCD_ICUCR1 + \
(((x) - 1) * sizeof(unsigned short)))
#ifdef CONFIG_USB
static pld_icu_data_t lcdpld_icu_data[M32700UT_NUM_LCD_PLD_IRQ];
static void disable_m32700ut_lcdpld_irq(unsigned int irq)
@ -253,6 +254,7 @@ static struct irq_chip m32700ut_lcdpld_irq_type =
.irq_mask = mask_m32700ut_lcdpld,
.irq_unmask = unmask_m32700ut_lcdpld,
};
#endif
void __init init_IRQ(void)
{

View file

@ -1009,7 +1009,8 @@ static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
#define pmd_move_must_withdraw pmd_move_must_withdraw
struct spinlock;
static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
struct spinlock *old_pmd_ptl)
struct spinlock *old_pmd_ptl,
struct vm_area_struct *vma)
{
if (radix_enabled())
return false;
@ -1020,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
*/
return true;
}
#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
/*
 * A page table deposit is needed exactly when radix_enabled() reports
 * false; with radix enabled this returns false.
 */
static inline bool arch_needs_pgtable_deposit(void)
{
	return !radix_enabled();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */

View file

@ -28,6 +28,7 @@
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
#define __tlb_remove_tlb_entry __tlb_remove_tlb_entry
#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
extern void tlb_flush(struct mmu_gather *tlb);
@ -46,6 +47,21 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
#endif
}
/*
 * Track the page size being gathered in @tlb and flush when it changes.
 *
 * - No size recorded yet: just record @page_size.
 * - Same size as recorded: nothing to do.
 * - Different size: flush the gather first, then record the new size.
 */
static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
						     unsigned int page_size)
{
	if (tlb->page_size) {
		if (tlb->page_size == page_size)
			return;

		tlb_flush_mmu(tlb);
	}

	/*
	 * Record the size last: after a flush this is the page size for
	 * the new mmu_gather.
	 */
	tlb->page_size = page_size;
}
#ifdef CONFIG_SMP
static inline int mm_is_core_local(struct mm_struct *mm)
{

View file

@ -1085,7 +1085,7 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr)
int hot_add_scn_to_nid(unsigned long scn_addr)
{
struct device_node *memory = NULL;
int nid, found = 0;
int nid;
if (!numa_enabled || (min_common_depth < 0))
return first_online_node;
@ -1101,17 +1101,6 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
if (nid < 0 || !node_online(nid))
nid = first_online_node;
if (NODE_DATA(nid)->node_spanned_pages)
return nid;
for_each_online_node(nid) {
if (NODE_DATA(nid)->node_spanned_pages) {
found = 1;
break;
}
}
BUG_ON(!found);
return nid;
}

View file

@ -104,12 +104,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
return __tlb_remove_page(tlb, page);
}
static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
struct page *page)
{
return __tlb_remove_page(tlb, page);
}
static inline void tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, int page_size)
{
@ -162,5 +156,13 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
#define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0)
#define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0)
#define tlb_migrate_finish(mm) do { } while (0)
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
tlb_remove_tlb_entry(tlb, ptep, address)
#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
/*
 * Intentionally empty: this variant takes no action when the page size
 * passed alongside the mmu_gather changes. Both arguments are ignored.
 */
static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
unsigned int page_size)
{
}
#endif /* _S390_TLB_H */

View file

@ -1015,7 +1015,7 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
if (slot) {
rmap->next = radix_tree_deref_slot_protected(slot,
&sg->guest_table_lock);
radix_tree_replace_slot(slot, rmap);
radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
} else {
rmap->next = NULL;
radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,

View file

@ -65,6 +65,9 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address)
tlb->end = address + PAGE_SIZE;
}
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
tlb_remove_tlb_entry(tlb, ptep, address)
/*
* In the case of tlb vma handling, we can optimise these away in the
* case where we're doing a full MM flush. When we're doing a munmap,
@ -115,18 +118,18 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
return __tlb_remove_page(tlb, page);
}
static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
struct page *page)
{
return __tlb_remove_page(tlb, page);
}
static inline void tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, int page_size)
{
return tlb_remove_page(tlb, page);
}
#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
/*
 * Intentionally empty: this variant takes no action when the page size
 * passed alongside the mmu_gather changes. Both arguments are ignored.
 */
static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
unsigned int page_size)
{
}
#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep)
#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp)
#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp)

View file

@ -116,12 +116,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
return __tlb_remove_page(tlb, page);
}
static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
struct page *page)
{
return __tlb_remove_page(tlb, page);
}
static inline void tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, int page_size)
{
@ -141,6 +135,15 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
__tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
tlb_remove_tlb_entry(tlb, ptep, address)
#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
/*
 * Intentionally empty: this variant takes no action when the page size
 * passed alongside the mmu_gather changes. Both arguments are ignored.
 */
static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
unsigned int page_size)
{
}
#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr)
#define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr)

View file

@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
paravirt_free_ldt(ldt->entries, ldt->size);
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(ldt->entries);
vfree_atomic(ldt->entries);
else
free_page((unsigned long)ldt->entries);
kfree(ldt);

View file

@ -985,6 +985,30 @@ void __init setup_arch(char **cmdline_p)
parse_early_param();
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux
* cannot migrate the kernel pages. When memory hotplug is
* enabled, we should prevent memblock from allocating memory
* for the kernel.
*
* ACPI SRAT records all hotpluggable memory ranges. But before
* SRAT is parsed, we don't know about it.
*
* The kernel image is loaded into memory at very early time. We
* cannot prevent this anyway. So on NUMA system, we set any
* node the kernel resides in as un-hotpluggable.
*
* Since on modern servers, one node could have double-digit
* gigabytes memory, we can assume the memory around the kernel
* image is also un-hotpluggable. So before SRAT is parsed, just
* allocate memory near the kernel image to try the best to keep
* the kernel away from hotpluggable memory.
*/
if (movable_node_is_enabled())
memblock_set_bottom_up(true);
#endif
x86_report_nx();
/* after early param, so could get panic from serial */

View file

@ -249,6 +249,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
limits->max_sectors = max_sectors;
q->backing_dev_info.io_pages = max_sectors >> (PAGE_SHIFT - 9);
}
EXPORT_SYMBOL(blk_queue_max_hw_sectors);

View file

@ -212,6 +212,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
spin_lock_irq(q->queue_lock);
q->limits.max_sectors = max_sectors_kb << 1;
q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
spin_unlock_irq(q->queue_lock);
return ret;

View file

@ -1015,6 +1015,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
const __be32 *reg, *endp;
int l;
bool hotpluggable;
/* We are scanning "memory" nodes only */
if (type == NULL) {
@ -1034,6 +1035,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
return 0;
endp = reg + (l / sizeof(__be32));
hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL);
pr_debug("memory scan node %s, reg size %d,\n", uname, l);
@ -1049,6 +1051,13 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
(unsigned long long)size);
early_init_dt_add_memory_arch(base, size);
if (!hotpluggable)
continue;
if (early_init_dt_mark_hotplug_memory_arch(base, size))
pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n",
base, base + size);
}
return 0;
@ -1146,6 +1155,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
memblock_add(base, size);
}
/*
 * Weak default for marking the range [base, base + size) as hotpluggable:
 * forwards to memblock_mark_hotplug() and returns its result unchanged.
 * Being __weak, it can be replaced by a strong definition elsewhere.
 */
int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size)
{
return memblock_mark_hotplug(base, size);
}
int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base,
phys_addr_t size, bool nomap)
{
@ -1168,6 +1182,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
WARN_ON(1);
}
/*
 * Weak fallback that marks nothing and always returns -ENOSYS; both
 * arguments are ignored.
 * NOTE(review): presumably the variant built when memblock is not
 * available -- confirm against the enclosing preprocessor conditional.
 */
int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size)
{
return -ENOSYS;
}
int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base,
phys_addr_t size, bool nomap)
{

View file

@ -296,10 +296,11 @@ static int __init is_alive(u_short sock)
return 0;
}
static void add_pcc_socket(ulong base, int irq, ulong mapaddr,
unsigned int ioaddr)
static int add_pcc_socket(ulong base, int irq, ulong mapaddr,
unsigned int ioaddr)
{
pcc_socket_t *t = &socket[pcc_sockets];
int err;
/* add sockets */
t->ioaddr = ioaddr;
@ -328,11 +329,16 @@ static void add_pcc_socket(ulong base, int irq, ulong mapaddr,
t->socket.irq_mask = 0;
t->socket.pci_irq = 2 + pcc_sockets; /* XXX */
request_irq(irq, pcc_interrupt, 0, "m32r-pcc", pcc_interrupt);
err = request_irq(irq, pcc_interrupt, 0, "m32r-pcc", pcc_interrupt);
if (err) {
if (t->base > 0)
release_region(t->base, 0x20);
return err;
}
pcc_sockets++;
return;
return 0;
}
@ -683,26 +689,29 @@ static int __init init_m32r_pcc(void)
return ret;
ret = platform_device_register(&pcc_device);
if (ret){
platform_driver_unregister(&pcc_driver);
return ret;
}
if (ret)
goto unreg_driv;
printk(KERN_INFO "m32r PCC probe:\n");
pcc_sockets = 0;
add_pcc_socket(M32R_PCC0_BASE, PCC0_IRQ, M32R_PCC0_MAPBASE, 0x1000);
ret = add_pcc_socket(M32R_PCC0_BASE, PCC0_IRQ, M32R_PCC0_MAPBASE,
0x1000);
if (ret)
goto unreg_dev;
#ifdef CONFIG_M32RPCC_SLOT2
add_pcc_socket(M32R_PCC1_BASE, PCC1_IRQ, M32R_PCC1_MAPBASE, 0x2000);
ret = add_pcc_socket(M32R_PCC1_BASE, PCC1_IRQ, M32R_PCC1_MAPBASE,
0x2000);
if (ret)
goto unreg_dev;
#endif
if (pcc_sockets == 0) {
printk("socket is not found.\n");
platform_device_unregister(&pcc_device);
platform_driver_unregister(&pcc_driver);
return -ENODEV;
ret = -ENODEV;
goto unreg_dev;
}
/* Set up interrupt handler(s) */
@ -728,6 +737,12 @@ static int __init init_m32r_pcc(void)
}
return 0;
unreg_dev:
platform_device_unregister(&pcc_device);
unreg_driv:
platform_driver_unregister(&pcc_driver);
return ret;
} /* init_m32r_pcc */
static void __exit exit_m32r_pcc(void)

View file

@ -254,7 +254,7 @@ static void __init intc_subgroup_map(struct intc_desc_int *d)
radix_tree_tag_clear(&d->tree, entry->enum_id,
INTC_TAG_VIRQ_NEEDS_ALLOC);
radix_tree_replace_slot((void **)entries[i],
radix_tree_replace_slot(&d->tree, (void **)entries[i],
&intc_irq_xlate[irq]);
}

View file

@ -2204,7 +2204,9 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL);
if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz))
goto end_coredump;
vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz));
if (!vma_filesz)
goto end_coredump;
@ -2311,7 +2313,7 @@ static int elf_core_dump(struct coredump_params *cprm)
cleanup:
free_note_info(&info);
kfree(shdr4extnum);
kfree(vma_filesz);
vfree(vma_filesz);
kfree(phdr4note);
kfree(elf);
out:

View file

@ -202,27 +202,31 @@ static struct ratelimit_state printk_limits[] = {
void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
char lvl[4];
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1];
struct va_format vaf;
va_list args;
const char *type = logtypes[4];
const char *type = NULL;
int kern_level;
struct ratelimit_state *ratelimit;
va_start(args, fmt);
kern_level = printk_get_level(fmt);
if (kern_level) {
while ((kern_level = printk_get_level(fmt)) != 0) {
size_t size = printk_skip_level(fmt) - fmt;
memcpy(lvl, fmt, size);
lvl[size] = '\0';
if (kern_level >= '0' && kern_level <= '7') {
memcpy(lvl, fmt, size);
lvl[size] = '\0';
type = logtypes[kern_level - '0'];
ratelimit = &printk_limits[kern_level - '0'];
}
fmt += size;
type = logtypes[kern_level - '0'];
ratelimit = &printk_limits[kern_level - '0'];
} else {
}
if (!type) {
*lvl = '\0';
/* Default to debug output */
ratelimit = &printk_limits[7];
type = logtypes[4];
ratelimit = &printk_limits[4];
}
vaf.fmt = fmt;

View file

@ -342,7 +342,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot)
radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
entry |= RADIX_DAX_ENTRY_LOCK;
radix_tree_replace_slot(slot, (void *)entry);
radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
return (void *)entry;
}
@ -356,7 +356,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
radix_tree_replace_slot(slot, (void *)entry);
radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
return (void *)entry;
}
@ -643,12 +643,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
}
mapping->nrexceptional++;
} else {
struct radix_tree_node *node;
void **slot;
void *ret;
ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
ret = __radix_tree_lookup(page_tree, index, &node, &slot);
WARN_ON_ONCE(ret != entry);
radix_tree_replace_slot(slot, new_entry);
__radix_tree_replace(page_tree, node, slot,
new_entry, NULL, NULL);
}
if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);

View file

@ -1769,15 +1769,13 @@ static long wb_writeback(struct bdi_writeback *wb,
* become available for writeback. Otherwise
* we'll just busyloop.
*/
if (!list_empty(&wb->b_more_io)) {
trace_writeback_wait(wb, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
/* This function drops i_lock... */
inode_sleep_on_writeback(inode);
spin_lock(&wb->list_lock);
}
trace_writeback_wait(wb, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
/* This function drops i_lock... */
inode_sleep_on_writeback(inode);
spin_lock(&wb->list_lock);
}
spin_unlock(&wb->list_lock);
blk_finish_plug(&plug);

View file

@ -1950,8 +1950,7 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
}
int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
loff_t pos, unsigned len, unsigned copied, void *fsdata)
{
int i, ret;
unsigned from, to, start = pos & (PAGE_SIZE - 1);
@ -2064,7 +2063,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
int ret;
struct inode *inode = mapping->host;
ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_inode_unlock(inode, 1);
@ -2241,7 +2240,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
dwc->dw_zero_count++;
}
ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
BUG_ON(ret != len);
ret = 0;
unlock:

View file

@ -44,8 +44,7 @@ int walk_page_buffers( handle_t *handle,
struct buffer_head *bh));
int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
loff_t pos, unsigned len, unsigned copied, void *fsdata);
typedef enum {
OCFS2_WRITE_BUFFER = 0,

View file

@ -741,7 +741,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
memset(hb_block, 0, reg->hr_block_bytes);
/* TODO: time stuff */
cputime = CURRENT_TIME.tv_sec;
cputime = ktime_get_real_seconds();
if (!cputime)
cputime = 1;

View file

@ -1609,8 +1609,6 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
__dlm_insert_mle(dlm, mle);
response = DLM_MASTER_RESP_NO;
} else {
// mlog(0, "mle was found\n");
set_maybe = 1;
spin_lock(&tmpmle->spinlock);
if (tmpmle->master == dlm->node_num) {
mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
@ -1625,8 +1623,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
response = DLM_MASTER_RESP_NO;
} else
response = DLM_MASTER_RESP_MAYBE;
if (set_maybe)
set_bit(request->node_idx, tmpmle->maybe_map);
set_bit(request->node_idx, tmpmle->maybe_map);
spin_unlock(&tmpmle->spinlock);
}
spin_unlock(&dlm->master_lock);
@ -1644,12 +1641,6 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
* dlm_assert_master_worker() isn't called, we drop it here.
*/
if (dispatch_assert) {
if (response != DLM_MASTER_RESP_YES)
mlog(ML_ERROR, "invalid response %d\n", response);
if (!res) {
mlog(ML_ERROR, "bad lockres while trying to assert!\n");
BUG();
}
mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
dlm->node_num, res->lockname.len, res->lockname.name);
spin_lock(&res->spinlock);

View file

@ -2966,8 +2966,6 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&dlm->spinlock);
dlm_kick_recovery_thread(dlm);
break;
default:
BUG();
}
mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",

View file

@ -703,7 +703,7 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail_commit;
}
di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
di->i_dtime = cpu_to_le64(ktime_get_real_seconds());
di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
ocfs2_journal_dirty(handle, di_bh);

View file

@ -1947,7 +1947,7 @@ static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
*/
seqno++;
os->os_count++;
os->os_scantime = CURRENT_TIME;
os->os_scantime = ktime_get_seconds();
unlock:
ocfs2_orphan_scan_unlock(osb, seqno);
out:
@ -2004,7 +2004,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
struct ocfs2_orphan_scan *os;
os = &osb->osb_orphan_scan;
os->os_scantime = CURRENT_TIME;
os->os_scantime = ktime_get_seconds();
if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {

View file

@ -120,8 +120,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
ret = VM_FAULT_NOPAGE;
goto out;
}
ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
fsdata);
ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata);
BUG_ON(ret != len);
ret = VM_FAULT_LOCKED;
out:

View file

@ -516,6 +516,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
struct ocfs2_extent_list *fel;
u16 feat;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct timespec64 ts;
*new_fe_bh = NULL;
@ -564,10 +565,11 @@ static int __ocfs2_mknod_locked(struct inode *dir,
fe->i_last_eb_blk = 0;
strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
ktime_get_real_ts64(&ts);
fe->i_atime = fe->i_ctime = fe->i_mtime =
cpu_to_le64(CURRENT_TIME.tv_sec);
cpu_to_le64(ts.tv_sec);
fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
cpu_to_le32(CURRENT_TIME.tv_nsec);
cpu_to_le32(ts.tv_nsec);
fe->i_dtime = 0;
/*

View file

@ -224,7 +224,7 @@ struct ocfs2_orphan_scan {
struct ocfs2_super *os_osb;
struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
struct delayed_work os_orphan_scan_work;
struct timespec os_scantime; /* time this node ran the scan */
time64_t os_scantime; /* time this node ran the scan */
u32 os_count; /* tracks node specific scans */
u32 os_seqno; /* tracks cluster wide scans */
atomic_t os_state; /* ACTIVE or INACTIVE */

View file

@ -478,7 +478,6 @@ int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
if (ret) {
mlog_errno(ret);
ocfs2_unlock_refcount_tree(osb, tree, rw);
ocfs2_refcount_tree_put(tree);
goto out;
}

View file

@ -337,7 +337,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
out += snprintf(buf + out, len - out, "Disabled\n");
else
out += snprintf(buf + out, len - out, "%lu seconds ago\n",
(get_seconds() - os->os_scantime.tv_sec));
(unsigned long)(ktime_get_seconds() - os->os_scantime));
out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
"Slots", "Num", "RecoGen");

View file

@ -245,7 +245,7 @@ void render_sigset_t(struct seq_file *m, const char *header,
if (sigismember(set, i+2)) x |= 2;
if (sigismember(set, i+3)) x |= 4;
if (sigismember(set, i+4)) x |= 8;
seq_printf(m, "%x", x);
seq_putc(m, hex_asc[x]);
} while (i >= 4);
seq_putc(m, '\n');
@ -342,10 +342,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
{
seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode);
seq_putc(m, '\n');
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
#endif
seq_putc(m, '\n');
}
static inline void task_context_switch_counts(struct seq_file *m,

View file

@ -104,9 +104,12 @@
* in /proc for a task before it execs a suid executable.
*/
static u8 nlink_tid;
static u8 nlink_tgid;
struct pid_entry {
const char *name;
int len;
unsigned int len;
umode_t mode;
const struct inode_operations *iop;
const struct file_operations *fop;
@ -139,13 +142,13 @@ struct pid_entry {
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
*/
static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
unsigned int n)
{
unsigned int i;
unsigned int count;
count = 0;
count = 2;
for (i = 0; i < n; ++i) {
if (S_ISDIR(entries[i].mode))
++count;
@ -1967,7 +1970,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
struct map_files_info {
fmode_t mode;
unsigned long len;
unsigned int len;
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};
@ -2412,14 +2415,14 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
* Yes, it does not scale. And it should not. Don't add
* new entries into /proc/<tgid>/ without very good reasons.
*/
last = &ents[nents - 1];
for (p = ents; p <= last; p++) {
last = &ents[nents];
for (p = ents; p < last; p++) {
if (p->len != dentry->d_name.len)
continue;
if (!memcmp(dentry->d_name.name, p->name, p->len))
break;
}
if (p > last)
if (p >= last)
goto out;
error = proc_pident_instantiate(dir, dentry, task, p);
@ -2444,7 +2447,7 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
if (ctx->pos >= nents + 2)
goto out;
for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
if (!proc_fill_cache(file, ctx, p->name, p->len,
proc_pident_instantiate, task, p))
break;
@ -3068,8 +3071,7 @@ static int proc_pid_instantiate(struct inode *dir,
inode->i_fop = &proc_tgid_base_operations;
inode->i_flags|=S_IMMUTABLE;
set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff,
ARRAY_SIZE(tgid_base_stuff)));
set_nlink(inode, nlink_tgid);
d_set_d_op(dentry, &pid_dentry_operations);
@ -3361,8 +3363,7 @@ static int proc_task_instantiate(struct inode *dir,
inode->i_fop = &proc_tid_base_operations;
inode->i_flags|=S_IMMUTABLE;
set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff,
ARRAY_SIZE(tid_base_stuff)));
set_nlink(inode, nlink_tid);
d_set_d_op(dentry, &pid_dentry_operations);
@ -3552,3 +3553,9 @@ static const struct file_operations proc_task_operations = {
.iterate_shared = proc_task_readdir,
.llseek = generic_file_llseek,
};
/*
 * Compute, once during init, the link counts that the tgid and tid
 * instantiate paths hand to set_nlink(): one value per pid_entry table,
 * each obtained from pid_entry_nlink().
 */
void __init set_proc_pid_nlink(void)
{
	nlink_tgid = pid_entry_nlink(tgid_base_stuff,
				     ARRAY_SIZE(tgid_base_stuff));
	nlink_tid = pid_entry_nlink(tid_base_stuff,
				    ARRAY_SIZE(tid_base_stuff));
}

View file

@ -138,6 +138,16 @@ static void unuse_pde(struct proc_dir_entry *pde)
/* pde is locked */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
{
/*
* close() (proc_reg_release()) can't delete an entry and proceed:
* ->release hook needs to be available at the right moment.
*
* rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
* "struct file" needs to be available at the right moment.
*
* Therefore, first process to enter this function does ->release() and
* signals its completion to the other process which does nothing.
*/
if (pdeo->closing) {
/* somebody else is doing that, just wait */
DECLARE_COMPLETION_ONSTACK(c);
@ -147,12 +157,13 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
spin_lock(&pde->pde_unload_lock);
} else {
struct file *file;
pdeo->closing = 1;
pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
file = pdeo->file;
pde->proc_fops->release(file_inode(file), file);
spin_lock(&pde->pde_unload_lock);
list_del_init(&pdeo->lh);
/* After ->release. */
list_del(&pdeo->lh);
if (pdeo->c)
complete(pdeo->c);
kfree(pdeo);
@ -167,6 +178,8 @@ void proc_entry_rundown(struct proc_dir_entry *de)
if (atomic_add_return(BIAS, &de->in_use) != BIAS)
wait_for_completion(&c);
/* ->pde_openers list can't grow from now on. */
spin_lock(&de->pde_unload_lock);
while (!list_empty(&de->pde_openers)) {
struct pde_opener *pdeo;
@ -312,16 +325,17 @@ static int proc_reg_open(struct inode *inode, struct file *file)
struct pde_opener *pdeo;
/*
* What for, you ask? Well, we can have open, rmmod, remove_proc_entry
* sequence. ->release won't be called because ->proc_fops will be
* cleared. Depending on complexity of ->release, consequences vary.
* Ensure that
* 1) PDE's ->release hook will be called no matter what
* either normally by close()/->release, or forcefully by
* rmmod/remove_proc_entry.
*
* We can't wait for mercy when close will be done for real, it's
* deadlockable: rmmod foo </proc/foo . So, we're going to do ->release
* by hand in remove_proc_entry(). For this, save opener's credentials
* for later.
* 2) rmmod isn't blocked by opening file in /proc and sitting on
* the descriptor (including "rmmod foo </proc/foo" scenario).
*
* Save every "struct file" with custom ->release hook.
*/
pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL);
pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
if (!pdeo)
return -ENOMEM;
@ -338,7 +352,8 @@ static int proc_reg_open(struct inode *inode, struct file *file)
if (rv == 0 && release) {
/* To know what to release. */
pdeo->file = file;
/* Strictly for "too late" ->release in proc_reg_release(). */
pdeo->closing = false;
pdeo->c = NULL;
spin_lock(&pde->pde_unload_lock);
list_add(&pdeo->lh, &pde->pde_openers);
spin_unlock(&pde->pde_unload_lock);

View file

@ -203,7 +203,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name);
struct pde_opener {
struct file *file;
struct list_head lh;
int closing;
bool closing;
struct completion *c;
};
extern const struct inode_operations proc_link_inode_operations;
@ -211,6 +211,7 @@ extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
extern void proc_init_inodecache(void);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern int proc_fill_super(struct super_block *, void *data, int flags);
extern void proc_entry_rundown(struct proc_dir_entry *);

View file

@ -122,6 +122,7 @@ void __init proc_root_init(void)
int err;
proc_init_inodecache();
set_proc_pid_nlink();
err = register_filesystem(&proc_fs_type);
if (err)
return;

View file

@ -1588,6 +1588,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(orig_pte, ptl);
cond_resched();
return 0;
}
#ifdef CONFIG_HUGETLB_PAGE

View file

@ -652,18 +652,9 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
}
#endif
#ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
spinlock_t *old_pmd_ptl)
{
/*
* With split pmd lock we also need to move preallocated
* PTE page table if new_pmd is on different PMD page table.
*/
return new_pmd_ptl != old_pmd_ptl;
}
#ifndef arch_needs_pgtable_deposit
#define arch_needs_pgtable_deposit() (false)
#endif
/*
* This function is meant to be used by sites walking pagetables with
* the mmap_sem held in read mode to protect against MADV_DONTNEED and

View file

@ -107,11 +107,6 @@ struct mmu_gather {
struct mmu_gather_batch local;
struct page *__pages[MMU_GATHER_BUNDLE];
unsigned int batch_count;
/*
* __tlb_adjust_range will track the new addr here,
* that that we can adjust the range after the flush
*/
unsigned long addr;
int page_size;
};
@ -125,16 +120,11 @@ extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
int page_size);
static inline void __tlb_adjust_range(struct mmu_gather *tlb,
unsigned long address)
unsigned long address,
unsigned int range_size)
{
tlb->start = min(tlb->start, address);
tlb->end = max(tlb->end, address + PAGE_SIZE);
/*
* Track the last address with which we adjusted the range. This
* will be used later to adjust again after a mmu_flush due to
* failed __tlb_remove_page
*/
tlb->addr = address;
tlb->end = max(tlb->end, address + range_size);
}
static inline void __tlb_reset_range(struct mmu_gather *tlb)
@ -150,15 +140,11 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
static inline void tlb_remove_page_size(struct mmu_gather *tlb,
struct page *page, int page_size)
{
if (__tlb_remove_page_size(tlb, page, page_size)) {
if (__tlb_remove_page_size(tlb, page, page_size))
tlb_flush_mmu(tlb);
tlb->page_size = page_size;
__tlb_adjust_range(tlb, tlb->addr);
__tlb_remove_page_size(tlb, page, page_size);
}
}
static bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
return __tlb_remove_page_size(tlb, page, PAGE_SIZE);
}
@ -172,14 +158,21 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
return tlb_remove_page_size(tlb, page, PAGE_SIZE);
}
static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page)
#ifndef tlb_remove_check_page_size_change
#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
unsigned int page_size)
{
/* active->nr should be zero when we call this */
VM_BUG_ON_PAGE(tlb->active->nr, page);
tlb->page_size = PAGE_SIZE;
__tlb_adjust_range(tlb, tlb->addr);
return __tlb_remove_page(tlb, page);
/*
* We don't care about page size change, just update
* mmu_gather page size here so that debug checks
* don't throw a false warning.
*/
#ifdef CONFIG_DEBUG_VM
tlb->page_size = page_size;
#endif
}
#endif
/*
* In the case of tlb vma handling, we can optimise these away in the
@ -215,10 +208,16 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa
*/
#define tlb_remove_tlb_entry(tlb, ptep, address) \
do { \
__tlb_adjust_range(tlb, address); \
__tlb_adjust_range(tlb, address, PAGE_SIZE); \
__tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
/*
* Same two steps as tlb_remove_tlb_entry(), except that the range handed to
* __tlb_adjust_range() is huge_page_size(h) bytes long instead of PAGE_SIZE.
*/
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
do { \
__tlb_adjust_range(tlb, address, huge_page_size(h)); \
__tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
/**
* tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation
* This is a nop so far, because only x86 needs it.
@ -227,29 +226,47 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa
#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0)
#endif
#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \
do { \
__tlb_adjust_range(tlb, address); \
__tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \
#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \
do { \
__tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE); \
__tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \
} while (0)
/*
* For things like page tables caches (ie caching addresses "inside" the
* page tables, like x86 does), for legacy reasons, flushing an
* individual page had better flush the page table caches behind it. This
* is definitely how x86 works, for example. And if you have an
* architected non-legacy page table cache (which I'm not aware of
* anybody actually doing), you're going to have some architecturally
* explicit flushing for that, likely *separate* from a regular TLB entry
* flush, and thus you'd need more than just some range expansion..
*
* So if we ever find an architecture
* that would want something that odd, I think it is up to that
* architecture to do its own odd thing, not cause pain for others
* http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com
*
* For now w.r.t page table cache, mark the range_size as PAGE_SIZE
*/
#define pte_free_tlb(tlb, ptep, address) \
do { \
__tlb_adjust_range(tlb, address); \
__tlb_adjust_range(tlb, address, PAGE_SIZE); \
__pte_free_tlb(tlb, ptep, address); \
} while (0)
#ifndef __ARCH_HAS_4LEVEL_HACK
#define pud_free_tlb(tlb, pudp, address) \
do { \
__tlb_adjust_range(tlb, address); \
__tlb_adjust_range(tlb, address, PAGE_SIZE); \
__pud_free_tlb(tlb, pudp, address); \
} while (0)
#endif
#define pmd_free_tlb(tlb, pmdp, address) \
do { \
__tlb_adjust_range(tlb, address); \
__tlb_adjust_range(tlb, address, PAGE_SIZE); \
__pmd_free_tlb(tlb, pmdp, address); \
} while (0)

View file

@ -136,12 +136,13 @@ struct bdi_writeback {
struct backing_dev_info {
struct list_head bdi_list;
unsigned long ra_pages; /* max readahead in PAGE_SIZE units */
unsigned int capabilities; /* Device capabilities */
unsigned long io_pages; /* max allowed IO size */
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */
char *name;
unsigned int capabilities; /* Device capabilities */
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;

View file

@ -1,6 +1,9 @@
#ifndef __CMA_H__
#define __CMA_H__
#include <linux/init.h>
#include <linux/types.h>
/*
* There is always at least global CMA area and a few optional
* areas configured in kernel .config.

View file

@ -21,7 +21,7 @@
* clobbered. The issue is as follows: while the inline asm might
* access any memory it wants, the compiler could have fit all of
* @ptr into memory registers instead, and since @ptr never escaped
* from that, it proofed that the inline asm wasn't touching any of
* from that, it proved that the inline asm wasn't touching any of
* it. This version works well with both compilers, i.e. we're telling
* the compiler that the inline asm absolutely may see the contents
* of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495

View file

@ -189,6 +189,8 @@ static inline void deferred_split_huge_page(struct page *page) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct page *page) {}
static inline void split_huge_pmd_address(struct vm_area_struct *vma,
unsigned long address, bool freeze, struct page *page) {}

View file

@ -175,7 +175,7 @@ __printf(2, 3)
struct kthread_worker *
kthread_create_worker(unsigned int flags, const char namefmt[], ...);
struct kthread_worker *
__printf(3, 4) struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
const char namefmt[], ...);

View file

@ -7,6 +7,7 @@
#include <linux/mmzone.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
@ -177,6 +178,13 @@ static inline bool vma_migratable(struct vm_area_struct *vma)
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
return false;
/*
* DAX device mappings require predictable access latency, so avoid
* incurring periodic faults.
*/
if (vma_is_dax(vma))
return false;
#ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
if (vma->vm_flags & VM_HUGETLB)
return false;

View file

@ -71,6 +71,7 @@ extern int early_init_dt_scan_chosen_stdout(void);
extern void early_init_fdt_scan_reserved_mem(void);
extern void early_init_fdt_reserve_self(void);
extern void early_init_dt_add_memory_arch(u64 base, u64 size);
extern int early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size);
extern int early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size,
bool no_map);
extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align);

View file

@ -10,6 +10,8 @@
extern const char linux_banner[];
extern const char linux_proc_banner[];
#define PRINTK_MAX_SINGLE_HEADER_LEN 2
static inline int printk_get_level(const char *buffer)
{
if (buffer[0] == KERN_SOH_ASCII && buffer[1]) {
@ -31,6 +33,14 @@ static inline const char *printk_skip_level(const char *buffer)
return buffer;
}
/*
 * Step over every leading printk header in @buffer: as long as
 * printk_get_level() reports a level at the current position, advance
 * with printk_skip_level(). Returns the first position with no header.
 */
static inline const char *printk_skip_headers(const char *buffer)
{
	const char *pos;

	for (pos = buffer; printk_get_level(pos); pos = printk_skip_level(pos))
		;

	return pos;
}
#define CONSOLE_EXT_LOG_MAX 8192
/* printk's without a loglevel use this.. */
@ -40,10 +50,15 @@ static inline const char *printk_skip_level(const char *buffer)
#define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */
#define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */
#define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */
#define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */
#define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */
#define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */
/*
* Default used to be hard-coded at 7, we're now allowing it to be set from
* kernel config.
*/
#define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
extern int console_printk[];
#define console_loglevel (console_printk[0])

View file

@ -80,14 +80,11 @@ static inline bool radix_tree_is_internal_node(void *ptr)
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
RADIX_TREE_MAP_SHIFT))
/* Internally used bits of node->count */
#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1)
#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
struct radix_tree_node {
unsigned char shift; /* Bits remaining in each slot */
unsigned char offset; /* Slot offset in parent */
unsigned int count;
unsigned char shift; /* Bits remaining in each slot */
unsigned char offset; /* Slot offset in parent */
unsigned char count; /* Total entry count */
unsigned char exceptional; /* Exceptional entry count */
union {
struct {
/* Used when ascending tree */
@ -248,20 +245,6 @@ static inline int radix_tree_exception(void *arg)
return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
}
/**
* radix_tree_replace_slot - replace item in a slot
* @pslot: pointer to slot, returned by radix_tree_lookup_slot
* @item: new item to store in the slot.
*
* For use with radix_tree_lookup_slot(). Caller must hold tree write locked
* across slot lookup and replacement.
*/
static inline void radix_tree_replace_slot(void **pslot, void *item)
{
BUG_ON(radix_tree_is_internal_node(item));
rcu_assign_pointer(*pslot, item);
}
int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
unsigned order, struct radix_tree_node **nodep,
void ***slotp);
@ -276,7 +259,14 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
struct radix_tree_node **nodep, void ***slotp);
void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
bool __radix_tree_delete_node(struct radix_tree_root *root,
typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *);
void __radix_tree_replace(struct radix_tree_root *root,
struct radix_tree_node *node,
void **slot, void *item,
radix_tree_update_node_t update_node, void *private);
void radix_tree_replace_slot(struct radix_tree_root *root,
void **slot, void *item);
void __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);

View file

@ -137,11 +137,19 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
* anon_vma helper functions.
*/
void anon_vma_init(void); /* create anon_vma_cachep */
int anon_vma_prepare(struct vm_area_struct *);
int __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
/*
 * Inline fast path: when @vma already has an anon_vma there is nothing to
 * do and 0 is returned; otherwise the result of __anon_vma_prepare() is
 * returned.
 */
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	return likely(vma->anon_vma) ? 0 : __anon_vma_prepare(vma);
}
static inline void anon_vma_merge(struct vm_area_struct *vma,
struct vm_area_struct *next)
{

View file

@ -540,7 +540,11 @@ static inline int get_dumpable(struct mm_struct *mm)
/* leave room for more dump flags */
#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
/*
* This one-shot flag is dropped due to necessity of changing exe once again
* on NFS restore
*/
//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
#define MMF_HAS_UPROBES 19 /* has uprobes */
#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */

View file

@ -246,39 +246,7 @@ struct swap_info_struct {
void *workingset_eviction(struct address_space *mapping, struct page *page);
bool workingset_refault(void *shadow);
void workingset_activation(struct page *page);
extern struct list_lru workingset_shadow_nodes;
static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
{
return node->count & RADIX_TREE_COUNT_MASK;
}
static inline void workingset_node_pages_inc(struct radix_tree_node *node)
{
node->count++;
}
static inline void workingset_node_pages_dec(struct radix_tree_node *node)
{
VM_WARN_ON_ONCE(!workingset_node_pages(node));
node->count--;
}
static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
{
return node->count >> RADIX_TREE_COUNT_SHIFT;
}
static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
{
node->count += 1U << RADIX_TREE_COUNT_SHIFT;
}
static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
{
VM_WARN_ON_ONCE(!workingset_node_shadows(node));
node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
}
void workingset_update_node(struct radix_tree_node *node, void *private);
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;

View file

@ -82,6 +82,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
const void *caller);
extern void vfree(const void *addr);
extern void vfree_atomic(const void *addr);
extern void *vmap(struct page **pages, unsigned int count,
unsigned long flags, pgprot_t prot);

View file

@ -588,7 +588,7 @@ void __init prepare_namespace(void)
saved_root_name);
while (driver_probe_done() != 0 ||
(ROOT_DEV = name_to_dev_t(saved_root_name)) == 0)
msleep(100);
msleep(5);
async_synchronize_full();
}

View file

@ -697,7 +697,7 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
* Write to all consoles.
*/
retlen = strlen(kdb_buffer);
cp = (char *) printk_skip_level(kdb_buffer);
cp = (char *) printk_skip_headers(kdb_buffer);
if (!dbg_kdb_mode && kgdb_connected) {
gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
} else {

View file

@ -229,7 +229,7 @@ static inline void free_thread_stack(struct task_struct *tsk)
}
local_irq_restore(flags);
vfree(tsk->stack);
vfree_atomic(tsk->stack);
return;
}
#endif

View file

@ -106,7 +106,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* complain:
*/
if (sysctl_hung_task_warnings) {
sysctl_hung_task_warnings--;
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
t->comm, t->pid, timeout);
pr_err(" %s %s %.*s\n",

View file

@ -261,7 +261,8 @@ static void create_kthread(struct kthread_create_info *create)
}
}
static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
static __printf(4, 0)
struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
void *data, int node,
const char namefmt[],
va_list args)
@ -635,7 +636,7 @@ int kthread_worker_fn(void *worker_ptr)
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
static struct kthread_worker *
static __printf(3, 0) struct kthread_worker *
__kthread_create_worker(int cpu, unsigned int flags,
const char namefmt[], va_list args)
{

View file

@ -67,7 +67,8 @@ static int vprintk_nmi(const char *fmt, va_list args)
again:
len = atomic_read(&s->len);
if (len >= sizeof(s->buffer)) {
/* The trailing '\0' is not counted into len. */
if (len >= sizeof(s->buffer) - 1) {
atomic_inc(&nmi_message_lost);
return 0;
}
@ -79,7 +80,7 @@ static int vprintk_nmi(const char *fmt, va_list args)
if (!len)
smp_rmb();
add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
/*
* Do it once again if the buffer has been flushed in the meantime.
@ -113,16 +114,51 @@ static void printk_nmi_flush_line(const char *text, int len)
}
/*
* printk one line from the temporary buffer from @start index until
* and including the @end index.
*/
static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s,
int start, int end)
/* printk part of the temporary buffer line by line */
static int printk_nmi_flush_buffer(const char *start, size_t len)
{
const char *buf = s->buffer + start;
const char *c, *end;
bool header;
printk_nmi_flush_line(buf, (end - start) + 1);
c = start;
end = start + len;
header = true;
/* Print line by line. */
while (c < end) {
if (*c == '\n') {
printk_nmi_flush_line(start, c - start + 1);
start = ++c;
header = true;
continue;
}
/* Handle continuous lines or missing new line. */
if ((c + 1 < end) && printk_get_level(c)) {
if (header) {
c = printk_skip_level(c);
continue;
}
printk_nmi_flush_line(start, c - start);
start = c++;
header = true;
continue;
}
header = false;
c++;
}
/* Check if there was a partial line. Ignore pure header. */
if (start < end && !header) {
static const char newline[] = KERN_CONT "\n";
printk_nmi_flush_line(start, end - start);
printk_nmi_flush_line(newline, strlen(newline));
}
return len;
}
/*
@ -135,8 +171,8 @@ static void __printk_nmi_flush(struct irq_work *work)
__RAW_SPIN_LOCK_INITIALIZER(read_lock);
struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
unsigned long flags;
size_t len, size;
int i, last_i;
size_t len;
int i;
/*
* The lock has two functions. First, one reader has to flush all
@ -154,12 +190,14 @@ static void __printk_nmi_flush(struct irq_work *work)
/*
* This is just a paranoid check that nobody has manipulated
* the buffer in an unexpected way. If we printed something then
* @len must only increase.
* @len must only increase. Also it should never overflow the
* buffer size.
*/
if (i && i >= len) {
if ((i && i >= len) || len > sizeof(s->buffer)) {
const char *msg = "printk_nmi_flush: internal error\n";
printk_nmi_flush_line(msg, strlen(msg));
len = 0;
}
if (!len)
@ -167,22 +205,7 @@ static void __printk_nmi_flush(struct irq_work *work)
/* Make sure that data has been written up to the @len */
smp_rmb();
size = min(len, sizeof(s->buffer));
last_i = i;
/* Print line by line. */
for (; i < size; i++) {
if (s->buffer[i] == '\n') {
printk_nmi_flush_seq_line(s, last_i, i);
last_i = i + 1;
}
}
/* Check if there was a partial line. */
if (last_i < size) {
printk_nmi_flush_seq_line(s, last_i, size - 1);
printk_nmi_flush_line("\n", strlen("\n"));
}
i += printk_nmi_flush_buffer(s->buffer + i, len - i);
/*
* Check that nothing has got added in the meantime and truncate

View file

@ -1697,16 +1697,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
fput(exe_file);
}
/*
* The symlink can be changed only once, just to disallow arbitrary
* transitions malicious software might bring in. This means one
* could make a snapshot over all processes running and monitor
* /proc/pid/exe changes to notice unusual activity if needed.
*/
err = -EPERM;
if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
goto exit;
err = 0;
/* set the new file, lockless */
get_file(exe.file);

View file

@ -15,6 +15,21 @@ config PRINTK_TIME
The behavior is also controlled by the kernel command line
parameter printk.time=1. See Documentation/kernel-parameters.txt
config CONSOLE_LOGLEVEL_DEFAULT
int "Default console loglevel (1-15)"
range 1 15
default "7"
help
Default loglevel to determine what will be printed on the console.
Setting a default here is equivalent to passing in loglevel=<x> in
the kernel bootargs. loglevel=<x> continues to override whatever
value is specified here as well.
Note: This does not affect the log level of un-prefixed printk()
usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT
option.
config MESSAGE_LOGLEVEL_DEFAULT
int "Default message log level (1-7)"
range 1 7
@ -26,6 +41,10 @@ config MESSAGE_LOGLEVEL_DEFAULT
that are auditing their logs closely may want to set it to a lower
priority.
Note: This does not affect what message level gets printed on the console
by default. To change that, use loglevel=<x> in the kernel bootargs,
or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value.
config BOOT_PRINTK_DELAY
bool "Delay each boot printk message by N milliseconds"
depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
@ -1986,7 +2005,7 @@ config ARCH_HAS_DEVMEM_IS_ALLOWED
config STRICT_DEVMEM
bool "Filter access to /dev/mem"
depends on MMU
depends on MMU && DEVMEM
depends on ARCH_HAS_DEVMEM_IS_ALLOWED
default y if TILE || PPC
---help---

View file

@ -927,6 +927,9 @@ EXPORT_SYMBOL(ida_pre_get);
* and go back to the ida_pre_get() call. If the ida is full, it will
* return %-ENOSPC.
*
* Note that callers must ensure that concurrent access to @ida is not possible.
* See ida_simple_get() for a variant which takes care of locking.
*
* @p_id returns a value in the range @starting_id ... %0x7fffffff.
*/
int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
@ -1073,6 +1076,9 @@ EXPORT_SYMBOL(ida_destroy);
* Allocates an id in the range start <= id < end, or returns -ENOSPC.
* On memory allocation failure, returns -ENOMEM.
*
* Compared to ida_get_new_above() this function does its own locking, and
* should be used unless there are special requirements.
*
* Use ida_simple_remove() to get rid of an id.
*/
int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
@ -1119,6 +1125,11 @@ EXPORT_SYMBOL(ida_simple_get);
* ida_simple_remove - remove an allocated id.
* @ida: the (initialized) ida.
* @id: the id returned by ida_simple_get.
*
* Use to release an id allocated with ida_simple_get().
*
* Compared to ida_remove() this function does its own locking, and should be
* used unless there are special requirements.
*/
void ida_simple_remove(struct ida *ida, unsigned int id)
{

View file

@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
{
unsigned long i;
pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n",
pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n",
node, node->offset,
node->tags[0][0], node->tags[1][0], node->tags[2][0],
node->shift, node->count, node->parent);
node->shift, node->count, node->exceptional, node->parent);
for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
unsigned long first = index | (i << node->shift);
@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
tag_clear(node, i, 0);
node->slots[0] = NULL;
node->count = 0;
kmem_cache_free(radix_tree_node_cachep, node);
}
@ -522,8 +521,13 @@ static int radix_tree_extend(struct radix_tree_root *root,
node->offset = 0;
node->count = 1;
node->parent = NULL;
if (radix_tree_is_internal_node(slot))
if (radix_tree_is_internal_node(slot)) {
entry_to_node(slot)->parent = node;
} else {
/* Moving an exceptional root->rnode to a node */
if (radix_tree_exceptional_entry(slot))
node->exceptional = 1;
}
node->slots[0] = slot;
slot = node_to_entry(node);
rcu_assign_pointer(root->rnode, slot);
@ -533,6 +537,104 @@ static int radix_tree_extend(struct radix_tree_root *root,
return maxshift + RADIX_TREE_MAP_SHIFT;
}
/**
* radix_tree_shrink - shrink radix tree to minimum height
* @root: radix tree root
* @update_node: optional callback; when non-NULL it is called with the node
* being dropped and @private, but only if that node's single
* child is not an internal node
* @private: opaque pointer handed through to @update_node
*
* Repeatedly replaces root->rnode with the only child of the current top
* node and frees that node, until one of the stop conditions in the loop
* is met.
*/
static inline void radix_tree_shrink(struct radix_tree_root *root,
radix_tree_update_node_t update_node,
void *private)
{
for (;;) {
struct radix_tree_node *node = root->rnode;
struct radix_tree_node *child;
/* Stop once the root no longer points at an internal node. */
if (!radix_tree_is_internal_node(node))
break;
node = entry_to_node(node);
/*
* If the candidate node has more than one child, or its child
* is not at the leftmost slot, or the child is a multiorder
* entry, we cannot shrink.
*/
if (node->count != 1)
break;
child = node->slots[0];
if (!child)
break;
if (!radix_tree_is_internal_node(child) && node->shift)
break;
/* The child is about to become the top of the tree: it has no parent. */
if (radix_tree_is_internal_node(child))
entry_to_node(child)->parent = NULL;
/*
* We don't need rcu_assign_pointer(), since we are simply
* moving the node from one part of the tree to another: if it
* was safe to dereference the old pointer to it
* (node->slots[0]), it will be safe to dereference the new
* one (root->rnode) as far as dependent read barriers go.
*/
root->rnode = child;
/*
* We have a dilemma here. The node's slot[0] must not be
* NULLed in case there are concurrent lookups expecting to
* find the item. However if this was a bottom-level node,
* then it may be subject to the slot pointer being visible
* to callers dereferencing it. If item corresponding to
* slot[0] is subsequently deleted, these callers would expect
* their slot to become empty sooner or later.
*
* For example, lockless pagecache will look up a slot, deref
* the page pointer, and if the page has 0 refcount it means it
* was concurrently deleted from pagecache so try the deref
* again. Fortunately there is already a requirement for logic
* to retry the entire slot lookup -- the indirect pointer
* problem (replacing direct root node with an indirect pointer
* also results in a stale slot). So tag the slot as indirect
* to force callers to retry.
*/
node->count = 0;
/*
* Only a node whose child was a plain (non-internal) entry gets the
* retry marker in slot 0 and is reported through @update_node.
*/
if (!radix_tree_is_internal_node(child)) {
node->slots[0] = RADIX_TREE_RETRY;
if (update_node)
update_node(node, private);
}
radix_tree_node_free(node);
}
}
/*
 * Free @node and then each ancestor in turn for as long as they are empty.
 * On reaching a node that still has entries, try to shrink the tree if that
 * node is the root node, then stop.
 */
static void delete_node(struct radix_tree_root *root,
			struct radix_tree_node *node,
			radix_tree_update_node_t update_node, void *private)
{
	for (;;) {
		struct radix_tree_node *up;

		if (node->count) {
			/* Still populated: at most the height can be reduced. */
			if (node == entry_to_node(root->rnode))
				radix_tree_shrink(root, update_node, private);
			return;
		}

		up = node->parent;
		if (!up) {
			/* The last node is going away: the tree becomes empty. */
			root_tag_clear_all(root);
			root->rnode = NULL;
		} else {
			up->slots[node->offset] = NULL;
			up->count--;
		}

		radix_tree_node_free(node);

		if (!up)
			return;
		node = up;
	}
}
/**
* __radix_tree_create - create a slot in a radix tree
* @root: radix tree root
@ -649,6 +751,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
if (node) {
unsigned offset = get_slot_offset(node, slot);
node->count++;
if (radix_tree_exceptional_entry(item))
node->exceptional++;
BUG_ON(tag_get(node, 0, offset));
BUG_ON(tag_get(node, 1, offset));
BUG_ON(tag_get(node, 2, offset));
@ -746,6 +850,85 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
}
EXPORT_SYMBOL(radix_tree_lookup);
/*
 * Store @item in @slot and, when @node is given, adjust the node's entry
 * and exceptional-entry counters by the difference between the old and the
 * new slot content.  With @warn_typeswitch set, warn once if the store
 * changes either counter.
 */
static void replace_slot(struct radix_tree_root *root,
			 struct radix_tree_node *node,
			 void **slot, void *item,
			 bool warn_typeswitch)
{
	void *prev = rcu_dereference_raw(*slot);
	int count_delta, exceptional_delta;

	WARN_ON_ONCE(radix_tree_is_internal_node(item));

	/* Each delta is -1, 0 or +1. */
	count_delta = !!item - !!prev;
	exceptional_delta = !!radix_tree_exceptional_entry(item) -
			    !!radix_tree_exceptional_entry(prev);

	WARN_ON_ONCE(warn_typeswitch && (count_delta || exceptional_delta));

	if (node) {
		node->count += count_delta;
		node->exceptional += exceptional_delta;
	}

	rcu_assign_pointer(*slot, item);
}
/**
 * __radix_tree_replace		- replace item in a slot
 * @root:		radix tree root
 * @node:		pointer to tree node
 * @slot:		pointer to slot in @node
 * @item:		new item to store in the slot.
 * @update_node:	callback for changing leaf nodes
 * @private:		private data to pass to @update_node
 *
 * For use with __radix_tree_lookup().  Caller must hold tree write locked
 * across slot lookup and replacement.
 */
void __radix_tree_replace(struct radix_tree_root *root,
			  struct radix_tree_node *node,
			  void **slot, void *item,
			  radix_tree_update_node_t update_node, void *private)
{
	/*
	 * Replacing exceptional entries and deleting entries are both
	 * supported here, but they have to be accounted against the node.
	 * Without a node that is only acceptable when the slot is
	 * root->rnode itself, so warn about type switches otherwise.
	 */
	bool warn_typeswitch = !node && slot != (void **)&root->rnode;

	replace_slot(root, node, slot, item, warn_typeswitch);

	if (node) {
		if (update_node)
			update_node(node, private);
		delete_node(root, node, update_node, private);
	}
}
/**
* radix_tree_replace_slot - replace item in a slot
* @root: radix tree root
* @slot: pointer to slot
* @item: new item to store in the slot.
*
* For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(),
* radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked
* across slot lookup and replacement.
*
* NOTE: This cannot be used to switch between non-entries (empty slots),
* regular entries, and exceptional entries, as that requires accounting
* inside the radix tree node. When switching from one type of entry or
* deleting, use __radix_tree_lookup() and __radix_tree_replace().
*
* No node is passed to replace_slot(), so no per-node counters are
* updated; a replacement that would have changed them triggers a one-time
* warning.
*/
void radix_tree_replace_slot(struct radix_tree_root *root,
void **slot, void *item)
{
/* node == NULL: no accounting; warn_typeswitch == true: warn on type change */
replace_slot(root, NULL, slot, item, true);
}
/**
* radix_tree_tag_set - set a tag on a radix tree node
* @root: radix tree root
@ -1393,75 +1576,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
}
#endif /* CONFIG_SHMEM && CONFIG_SWAP */
/**
* radix_tree_shrink - shrink radix tree to minimum height
* @root: radix tree root
*
* Repeatedly replaces the root node by its only child for as long as the
* root is an internal node whose single entry sits in slot 0 and is either
* another internal node or, when the root node's shift is zero, a leaf
* entry.
*
* Returns %true if at least one node was removed, %false otherwise.
*/
static inline bool radix_tree_shrink(struct radix_tree_root *root)
{
bool shrunk = false;
for (;;) {
struct radix_tree_node *node = root->rnode;
struct radix_tree_node *child;
/* Nothing to shrink once the root is no longer an internal node */
if (!radix_tree_is_internal_node(node))
break;
node = entry_to_node(node);
/*
* The candidate node has more than one child, or its child
* is not at the leftmost slot, or the child is a multiorder
* entry, we cannot shrink.
*/
if (node->count != 1)
break;
child = node->slots[0];
if (!child)
break;
if (!radix_tree_is_internal_node(child) && node->shift)
break;
/* An internal child becomes the new top node, so it has no parent */
if (radix_tree_is_internal_node(child))
entry_to_node(child)->parent = NULL;
/*
* We don't need rcu_assign_pointer(), since we are simply
* moving the node from one part of the tree to another: if it
* was safe to dereference the old pointer to it
* (node->slots[0]), it will be safe to dereference the new
* one (root->rnode) as far as dependent read barriers go.
*/
root->rnode = child;
/*
* We have a dilemma here. The node's slot[0] must not be
* NULLed in case there are concurrent lookups expecting to
* find the item. However if this was a bottom-level node,
* then it may be subject to the slot pointer being visible
* to callers dereferencing it. If item corresponding to
* slot[0] is subsequently deleted, these callers would expect
* their slot to become empty sooner or later.
*
* For example, lockless pagecache will look up a slot, deref
* the page pointer, and if the page has 0 refcount it means it
* was concurrently deleted from pagecache so try the deref
* again. Fortunately there is already a requirement for logic
* to retry the entire slot lookup -- the indirect pointer
* problem (replacing direct root node with an indirect pointer
* also results in a stale slot). So tag the slot as indirect
* to force callers to retry.
*/
if (!radix_tree_is_internal_node(child))
node->slots[0] = RADIX_TREE_RETRY;
radix_tree_node_free(node);
/* Remember that at least one level was removed */
shrunk = true;
}
return shrunk;
}
/**
* __radix_tree_delete_node - try to free node after clearing a slot
* @root: radix tree root
@ -1470,39 +1584,11 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
* After clearing the slot at @index in @node from radix tree
* rooted at @root, call this function to attempt freeing the
* node and shrinking the tree.
*
* Returns %true if @node was freed, %false otherwise.
*/
bool __radix_tree_delete_node(struct radix_tree_root *root,
void __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node)
{
bool deleted = false;
do {
struct radix_tree_node *parent;
if (node->count) {
if (node == entry_to_node(root->rnode))
deleted |= radix_tree_shrink(root);
return deleted;
}
parent = node->parent;
if (parent) {
parent->slots[node->offset] = NULL;
parent->count--;
} else {
root_tag_clear_all(root);
root->rnode = NULL;
}
radix_tree_node_free(node);
deleted = true;
node = parent;
} while (node);
return deleted;
delete_node(root, node, NULL, NULL);
}
static inline void delete_sibling_entries(struct radix_tree_node *node,
@ -1559,10 +1645,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
node_tag_clear(root, node, tag, offset);
delete_sibling_entries(node, node_to_entry(slot), offset);
node->slots[offset] = NULL;
node->count--;
__radix_tree_delete_node(root, node);
__radix_tree_replace(root, node, slot, NULL, NULL, NULL);
return entry;
}

View file

@ -296,11 +296,26 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
*
* (p) (p)
* / \ / \
* N S --> N Sl
* N S --> N sl
* / \ \
* sl Sr s
* sl Sr S
* \
* Sr
*
* Note: p might be red, and then both
* p and sl are red after rotation (which
* breaks property 4). This is fixed in
* Case 4 (in __rb_rotate_set_parents(),
* which sets sl to the color of p
* and sets p to RB_BLACK).
*
* (p) (sl)
* / \ / \
* N sl --> P S
* \ / \
* S N Sr
* \
* Sr
*/
tmp1 = tmp2->rb_right;
WRITE_ONCE(sibling->rb_left, tmp1);
@ -365,7 +380,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
}
break;
}
/* Case 3 - right rotate at sibling */
/* Case 3 - left rotate at sibling */
tmp1 = tmp2->rb_left;
WRITE_ONCE(sibling->rb_right, tmp1);
WRITE_ONCE(tmp2->rb_left, sibling);
@ -377,7 +392,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
tmp1 = sibling;
sibling = tmp2;
}
/* Case 4 - left rotate at parent + color flips */
/* Case 4 - right rotate at parent + color flips */
tmp2 = sibling->rb_right;
WRITE_ONCE(parent->rb_left, tmp2);
WRITE_ONCE(sibling->rb_right, parent);

View file

@ -153,7 +153,7 @@ config MOVABLE_NODE
bool "Enable to assign a node which has only movable memory"
depends on HAVE_MEMBLOCK
depends on NO_BOOTMEM
depends on X86_64
depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG
depends on NUMA
default n
help
@ -447,13 +447,9 @@ choice
benefit.
endchoice
#
# We don't deposit page tables on file THP mapping,
# but Power makes use of them to address MMU quirk.
#
config TRANSPARENT_HUGE_PAGECACHE
def_bool y
depends on TRANSPARENT_HUGEPAGE && !PPC
depends on TRANSPARENT_HUGEPAGE
#
# UP and nommu archs use km based percpu allocator

View file

@ -634,22 +634,6 @@ isolate_freepages_range(struct compact_control *cc,
return pfn;
}
/* Update the number of anon and file isolated pages in the zone */
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
struct page *page;
unsigned int count[2] = { 0, };
if (list_empty(&cc->migratepages))
return;
list_for_each_entry(page, &cc->migratepages, lru)
count[!!page_is_file_cache(page)]++;
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]);
}
/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
@ -866,6 +850,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
inc_node_page_state(page,
NR_ISOLATED_ANON + page_is_file_cache(page));
isolate_success:
list_add(&page->lru, &cc->migratepages);
@ -902,7 +888,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
locked = false;
}
acct_isolated(zone, cc);
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
cc->last_migrated_pfn = 0;
@ -988,7 +973,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
break;
}
acct_isolated(cc->zone, cc);
return pfn;
}
@ -1258,10 +1242,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
if (!low_pfn || cc->contended) {
acct_isolated(zone, cc);
if (!low_pfn || cc->contended)
return ISOLATE_ABORT;
}
/*
* Either we isolated something and proceed with migration. Or
@ -1271,7 +1253,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
break;
}
acct_isolated(zone, cc);
/* Record where migration scanner will be restarted. */
cc->migrate_pfn = low_pfn;

View file

@ -59,6 +59,10 @@ void __dump_page(struct page *page, const char *reason)
pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
sizeof(struct page), false);
if (reason)
pr_alert("page dumped because: %s\n", reason);

View file

@ -132,44 +132,29 @@ static int page_cache_tree_insert(struct address_space *mapping,
if (!dax_mapping(mapping)) {
if (shadowp)
*shadowp = p;
if (node)
workingset_node_shadows_dec(node);
} else {
/* DAX can replace empty locked entry with a hole */
WARN_ON_ONCE(p !=
(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
RADIX_DAX_ENTRY_LOCK));
/* DAX accounts exceptional entries as normal pages */
if (node)
workingset_node_pages_dec(node);
/* Wakeup waiters for exceptional entry lock */
dax_wake_mapping_entry_waiter(mapping, page->index,
false);
}
}
radix_tree_replace_slot(slot, page);
__radix_tree_replace(&mapping->page_tree, node, slot, page,
workingset_update_node, mapping);
mapping->nrpages++;
if (node) {
workingset_node_pages_inc(node);
/*
* Don't track node that contains actual pages.
*
* Avoid acquiring the list_lru lock if already
* untracked. The list_empty() test is safe as
* node->private_list is protected by
* mapping->tree_lock.
*/
if (!list_empty(&node->private_list))
list_lru_del(&workingset_shadow_nodes,
&node->private_list);
}
return 0;
}
static void page_cache_tree_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
int i, nr;
/* hugetlb pages are represented by one entry in the radix tree */
nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
@ -182,44 +167,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
__radix_tree_lookup(&mapping->page_tree, page->index + i,
&node, &slot);
VM_BUG_ON_PAGE(!node && nr != 1, page);
radix_tree_clear_tags(&mapping->page_tree, node, slot);
if (!node) {
VM_BUG_ON_PAGE(nr != 1, page);
/*
* We need a node to properly account shadow
* entries. Don't plant any without. XXX
*/
shadow = NULL;
}
radix_tree_replace_slot(slot, shadow);
if (!node)
break;
workingset_node_pages_dec(node);
if (shadow)
workingset_node_shadows_inc(node);
else
if (__radix_tree_delete_node(&mapping->page_tree, node))
continue;
/*
* Track node that only contains shadow entries. DAX mappings
* contain no shadow entries and may contain other exceptional
* entries so skip those.
*
* Avoid acquiring the list_lru lock if already tracked.
* The list_empty() test is safe as node->private_list is
* protected by mapping->tree_lock.
*/
if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
list_empty(&node->private_list)) {
node->private_data = mapping;
list_lru_add(&workingset_shadow_nodes,
&node->private_list);
}
__radix_tree_replace(&mapping->page_tree, node, slot, shadow,
workingset_update_node, mapping);
}
if (shadow) {

View file

@ -632,7 +632,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
return i;
}
bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
static bool vma_permits_fault(struct vm_area_struct *vma,
unsigned int fault_flags)
{
bool write = !!(fault_flags & FAULT_FLAG_WRITE);
bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
@ -857,14 +858,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
EXPORT_SYMBOL(get_user_pages_locked);
/*
* Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
* pass additional gup_flags as last parameter (like FOLL_HWPOISON).
* Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for
* tsk, mm to be specified.
*
* NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
* caller if required (just like with __get_user_pages). "FOLL_GET",
* "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
* according to the parameters "pages", "write", "force"
* respectively.
* caller if required (just like with __get_user_pages). "FOLL_GET"
* is set implicitly if "pages" is non-NULL.
*/
__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
@ -894,10 +893,8 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
* get_user_pages_unlocked(tsk, mm, ..., pages);
*
* It is functionally equivalent to get_user_pages_fast so
* get_user_pages_fast should be used instead, if the two parameters
* "tsk" and "mm" are respectively equal to current and current->mm,
* or if "force" shall be set to 1 (get_user_pages_fast misses the
* "force" parameter).
* get_user_pages_fast should be used instead if specific gup_flags
* (e.g. FOLL_FORCE) are not required.
*/
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)

View file

@ -285,6 +285,15 @@ static ssize_t use_zero_page_store(struct kobject *kobj,
}
static struct kobj_attribute use_zero_page_attr =
__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
/*
* Show handler behind hpage_pmd_size_attr: formats the value of
* HPAGE_PMD_SIZE into @buf as an unsigned decimal number followed by a
* newline and returns sprintf()'s result, i.e. the number of characters
* written. @kobj and @attr are not used.
*/
static ssize_t hpage_pmd_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
}
/* Attribute object that is listed in hugepage_attr[] */
static struct kobj_attribute hpage_pmd_size_attr =
__ATTR_RO(hpage_pmd_size);
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@ -307,6 +316,7 @@ static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
&use_zero_page_attr.attr,
&hpage_pmd_size_attr.attr,
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
&shmem_enabled_attr.attr,
#endif
@ -1323,6 +1333,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct mm_struct *mm = tlb->mm;
bool ret = false;
tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl)
goto out_unlocked;
@ -1378,12 +1390,23 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return ret;
}
/*
 * Withdraw the page table deposited for @pmd, free it and drop the mm's
 * page table counter accordingly.
 */
static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t deposited = pgtable_trans_huge_withdraw(mm, pmd);

	pte_free(mm, deposited);
	atomic_long_dec(&mm->nr_ptes);
}
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
pmd_t orig_pmd;
spinlock_t *ptl;
tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
@ -1399,12 +1422,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (vma_is_dax(vma)) {
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
tlb_remove_page(tlb, pmd_page(orig_pmd));
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else if (is_huge_zero_pmd(orig_pmd)) {
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
tlb_remove_page(tlb, pmd_page(orig_pmd));
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else {
struct page *page = pmd_page(orig_pmd);
page_remove_rmap(page, true);
@ -1417,6 +1440,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
atomic_long_dec(&tlb->mm->nr_ptes);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
}
spin_unlock(ptl);
@ -1425,6 +1450,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
}
#ifndef pmd_move_must_withdraw
/*
 * Tell whether the deposited page table has to be moved along with a huge
 * pmd.  Returns non-zero only for anonymous VMAs whose old and new pmd are
 * protected by different locks.
 */
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * With split pmd locks the preallocated PTE page table must also be
	 * moved when new_pmd sits on a different PMD page table; with the
	 * same lock there is nothing to move.
	 */
	if (new_pmd_ptl == old_pmd_ptl)
		return 0;

	/* Tables are neither deposited nor withdrawn for file pages. */
	return vma_is_anonymous(vma);
}
#endif
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
@ -1462,8 +1502,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
force_flush = true;
VM_BUG_ON(!pmd_none(*new_pmd));
if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
vma_is_anonymous(vma)) {
if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
@ -1589,6 +1628,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (!vma_is_anonymous(vma)) {
_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/*
* We are going to unmap this huge page. So
* just go ahead and zap it
*/
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
if (vma_is_dax(vma))
return;
page = pmd_page(_pmd);

View file

@ -3286,6 +3286,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
/*
* This is a hugetlb vma, all the pte entries should point
* to huge page.
*/
tlb_remove_check_page_size_change(tlb, sz);
tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
@ -3336,7 +3341,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
tlb_remove_tlb_entry(tlb, ptep, address);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
@ -3450,15 +3455,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
struct page *pagecache_page, spinlock_t *ptl)
unsigned long address, pte_t *ptep,
struct page *pagecache_page, spinlock_t *ptl)
{
pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
int ret = 0, outside_reserve = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
retry_avoidcopy:
@ -3711,8 +3718,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
vma_end_reservation(h, vma, address);
}
ptl = huge_pte_lockptr(h, mm, ptep);
spin_lock(ptl);
ptl = huge_pte_lock(h, mm, ptep);
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto backout;
@ -3733,7 +3739,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
}
spin_unlock(ptl);
@ -3888,8 +3894,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
ret = hugetlb_cow(mm, vma, address, ptep, entry,
pagecache_page, ptl);
ret = hugetlb_cow(mm, vma, address, ptep,
pagecache_page, ptl);
goto out_put_page;
}
entry = huge_pte_mkdirty(entry);
@ -4330,8 +4336,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!spte)
goto out;
ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
spin_lock(ptl);
ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));

View file

@ -86,24 +86,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
qlist_init(from);
}
/*
 * Move the leading part of @from, up to and including @last, to the end of
 * @to.  @size is the byte count of the moved part and is transferred from
 * @from->bytes to @to->bytes.
 */
static void qlist_move(struct qlist_head *from, struct qlist_node *last,
		       struct qlist_head *to, size_t size)
{
	struct qlist_node **link;

	/* Moving everything is handled by the dedicated helper. */
	if (unlikely(last == from->tail)) {
		qlist_move_all(from, to);
		return;
	}

	/* Pick the place where the moved chain gets attached. */
	if (qlist_empty(to))
		link = &to->head;
	else
		link = &to->tail->next;
	*link = from->head;

	to->tail = last;

	/* Cut the chain behind @last; the remainder stays on @from. */
	from->head = last->next;
	last->next = NULL;

	from->bytes -= size;
	to->bytes += size;
}
#define QUARANTINE_PERCPU_SIZE (1 << 20)
#define QUARANTINE_BATCHES \
(1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS)
/*
* The object quarantine consists of per-cpu queues and a global queue,
@ -111,11 +96,22 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last,
*/
static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
static struct qlist_head global_quarantine;
/* Round-robin FIFO array of batches. */
static struct qlist_head global_quarantine[QUARANTINE_BATCHES];
static int quarantine_head;
static int quarantine_tail;
/* Total size of all objects in global_quarantine across all batches. */
static unsigned long quarantine_size;
static DEFINE_SPINLOCK(quarantine_lock);
/* Maximum size of the global queue. */
static unsigned long quarantine_size;
static unsigned long quarantine_max_size;
/*
* Target size of a batch in global_quarantine.
* Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM.
*/
static unsigned long quarantine_batch_size;
/*
* The fraction of physical memory the quarantine is allowed to occupy.
@ -124,9 +120,6 @@ static unsigned long quarantine_size;
*/
#define QUARANTINE_FRACTION 32
#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4)
#define QUARANTINE_PERCPU_SIZE (1 << 20)
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
{
return virt_to_head_page(qlink)->slab_cache;
@ -191,21 +184,30 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
if (unlikely(!qlist_empty(&temp))) {
spin_lock_irqsave(&quarantine_lock, flags);
qlist_move_all(&temp, &global_quarantine);
WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
if (global_quarantine[quarantine_tail].bytes >=
READ_ONCE(quarantine_batch_size)) {
int new_tail;
new_tail = quarantine_tail + 1;
if (new_tail == QUARANTINE_BATCHES)
new_tail = 0;
if (new_tail != quarantine_head)
quarantine_tail = new_tail;
}
spin_unlock_irqrestore(&quarantine_lock, flags);
}
}
void quarantine_reduce(void)
{
size_t new_quarantine_size, percpu_quarantines;
size_t total_size, new_quarantine_size, percpu_quarantines;
unsigned long flags;
struct qlist_head to_free = QLIST_INIT;
size_t size_to_free = 0;
struct qlist_node *last;
if (likely(READ_ONCE(global_quarantine.bytes) <=
READ_ONCE(quarantine_size)))
if (likely(READ_ONCE(quarantine_size) <=
READ_ONCE(quarantine_max_size)))
return;
spin_lock_irqsave(&quarantine_lock, flags);
@ -214,24 +216,23 @@ void quarantine_reduce(void)
* Update quarantine size in case of hotplug. Allocate a fraction of
* the installed memory to quarantine minus per-cpu queue limits.
*/
new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
QUARANTINE_FRACTION;
percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
new_quarantine_size = (new_quarantine_size < percpu_quarantines) ?
0 : new_quarantine_size - percpu_quarantines;
WRITE_ONCE(quarantine_size, new_quarantine_size);
new_quarantine_size = (total_size < percpu_quarantines) ?
0 : total_size - percpu_quarantines;
WRITE_ONCE(quarantine_max_size, new_quarantine_size);
/* Aim at consuming at most 1/2 of slots in quarantine. */
WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE,
2 * total_size / QUARANTINE_BATCHES));
last = global_quarantine.head;
while (last) {
struct kmem_cache *cache = qlink_to_cache(last);
size_to_free += cache->size;
if (!last->next || size_to_free >
global_quarantine.bytes - QUARANTINE_LOW_SIZE)
break;
last = last->next;
if (likely(quarantine_size > quarantine_max_size)) {
qlist_move_all(&global_quarantine[quarantine_head], &to_free);
WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes);
quarantine_head++;
if (quarantine_head == QUARANTINE_BATCHES)
quarantine_head = 0;
}
qlist_move(&global_quarantine, last, &to_free, size_to_free);
spin_unlock_irqrestore(&quarantine_lock, flags);
@ -275,13 +276,14 @@ static void per_cpu_remove_cache(void *arg)
void quarantine_remove_cache(struct kmem_cache *cache)
{
unsigned long flags;
unsigned long flags, i;
struct qlist_head to_free = QLIST_INIT;
on_each_cpu(per_cpu_remove_cache, cache, 1);
spin_lock_irqsave(&quarantine_lock, flags);
qlist_move_cache(&global_quarantine, &to_free, cache);
for (i = 0; i < QUARANTINE_BATCHES; i++)
qlist_move_cache(&global_quarantine[i], &to_free, cache);
spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, cache);

View file

@ -136,6 +136,8 @@ static void kasan_end_report(unsigned long *flags)
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
if (panic_on_warn)
panic("panic_on_warn set ...\n");
kasan_enable_current();
}

View file

@ -1242,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
struct vm_area_struct *vma;
unsigned long addr;
pmd_t *pmd, _pmd;
bool deposited = false;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@ -1266,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
/* assume page table is clear */
_pmd = pmdp_collapse_flush(vma, addr, pmd);
/*
* now deposit the pgtable for arch that need it
* otherwise free it.
*/
if (arch_needs_pgtable_deposit()) {
/*
* The deposit should be visible only after
* collapse is seen by others.
*/
smp_wmb();
pgtable_trans_huge_deposit(vma->vm_mm, pmd,
pmd_pgtable(_pmd));
deposited = true;
}
spin_unlock(ptl);
up_write(&vma->vm_mm->mmap_sem);
atomic_long_dec(&vma->vm_mm->nr_ptes);
pte_free(vma->vm_mm, pmd_pgtable(_pmd));
if (!deposited) {
atomic_long_dec(&vma->vm_mm->nr_ptes);
pte_free(vma->vm_mm, pmd_pgtable(_pmd));
}
}
}
i_mmap_unlock_write(mapping);
@ -1403,6 +1420,9 @@ static void collapse_shmem(struct mm_struct *mm,
spin_lock_irq(&mapping->tree_lock);
slot = radix_tree_lookup_slot(&mapping->page_tree, index);
VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
&mapping->tree_lock), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
/*
@ -1423,9 +1443,10 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
radix_tree_replace_slot(slot,
radix_tree_replace_slot(&mapping->page_tree, slot,
new_page + (index % HPAGE_PMD_NR));
slot = radix_tree_iter_next(&iter);
index++;
continue;
out_lru:
@ -1521,9 +1542,11 @@ static void collapse_shmem(struct mm_struct *mm,
if (!page || iter.index < page->index) {
if (!nr_none)
break;
/* Put holes back where they were */
radix_tree_replace_slot(slot, NULL);
nr_none--;
/* Put holes back where they were */
radix_tree_delete(&mapping->page_tree,
iter.index);
slot = radix_tree_iter_next(&iter);
continue;
}
@ -1532,11 +1555,13 @@ static void collapse_shmem(struct mm_struct *mm,
/* Unfreeze the page. */
list_del(&page->lru);
page_ref_unfreeze(page, 2);
radix_tree_replace_slot(slot, page);
radix_tree_replace_slot(&mapping->page_tree,
slot, page);
spin_unlock_irq(&mapping->tree_lock);
putback_lru_page(page);
unlock_page(page);
spin_lock_irq(&mapping->tree_lock);
slot = radix_tree_iter_next(&iter);
}
VM_BUG_ON(nr_none);
spin_unlock_irq(&mapping->tree_lock);

View file

@ -19,7 +19,7 @@
*
*
* For more information on the algorithm and kmemleak usage, please see
* Documentation/kmemleak.txt.
* Documentation/dev-tools/kmemleak.rst.
*
* Notes on locking
* ----------------

View file

@ -281,6 +281,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (pmd_trans_unstable(pmd))
return 0;
tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {

View file

@ -2145,6 +2145,8 @@ struct memcg_kmem_cache_create_work {
struct work_struct work;
};
static struct workqueue_struct *memcg_kmem_cache_create_wq;
static void memcg_kmem_cache_create_func(struct work_struct *w)
{
struct memcg_kmem_cache_create_work *cw =
@ -2176,7 +2178,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
schedule_work(&cw->work);
queue_work(memcg_kmem_cache_create_wq, &cw->work);
}
static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
@ -5774,6 +5776,17 @@ static int __init mem_cgroup_init(void)
{
int cpu, node;
#ifndef CONFIG_SLOB
/*
* Kmem cache creation is mostly done with the slab_mutex held,
* so use a special workqueue to avoid stalling all worker
* threads in case lots of cgroups are created simultaneously.
*/
memcg_kmem_cache_create_wq =
alloc_ordered_workqueue("memcg_kmem_cache_create", 0);
BUG_ON(!memcg_kmem_cache_create_wq);
#endif
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);

View file

@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
struct mmu_gather_batch *batch;
VM_BUG_ON(!tlb->end);
if (!tlb->page_size)
tlb->page_size = page_size;
else {
if (page_size != tlb->page_size)
return true;
}
VM_WARN_ON(tlb->page_size != page_size);
batch = tlb->active;
/*
* Add the page and check if we are full. If so
* force a flush.
*/
batch->pages[batch->nr++] = page;
if (batch->nr == batch->max) {
if (!tlb_next_batch(tlb))
return true;
@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
}
VM_BUG_ON_PAGE(batch->nr > batch->max, page);
batch->pages[batch->nr++] = page;
return false;
}
@ -528,7 +526,11 @@ void free_pgd_range(struct mmu_gather *tlb,
end -= PMD_SIZE;
if (addr > end - 1)
return;
/*
* We add page table cache pages with PAGE_SIZE,
* (see pte_free_tlb()), flush the tlb if we need to
*/
tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
@ -1118,8 +1120,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t *start_pte;
pte_t *pte;
swp_entry_t entry;
struct page *pending_page = NULL;
tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
again:
init_rss_vec(rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@ -1172,7 +1174,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
print_bad_pte(vma, addr, ptent, page);
if (unlikely(__tlb_remove_page(tlb, page))) {
force_flush = 1;
pending_page = page;
addr += PAGE_SIZE;
break;
}
@ -1213,11 +1214,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (force_flush) {
force_flush = 0;
tlb_flush_mmu_free(tlb);
if (pending_page) {
/* remove the page with new size */
__tlb_remove_pte_page(tlb, pending_page);
pending_page = NULL;
}
if (addr != end)
goto again;
}
@ -1240,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
if (next - addr != HPAGE_PMD_SIZE) {
VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
split_huge_pmd(vma, pmd, addr);
__split_huge_pmd(vma, pmd, addr, false, NULL);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@ -2939,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
return true;
}
/*
 * Deposit the preallocated page table held in fe->prealloc_pte for the
 * pmd at fe->pmd (via pgtable_trans_huge_deposit()), account it in the
 * mm's nr_ptes counter, and clear fe->prealloc_pte afterwards.
 *
 * NOTE(review): the visible caller, do_set_pmd(), invokes this after taking
 * the pmd lock; presumably that lock is required here - confirm.
 */
static void deposit_prealloc_pte(struct fault_env *fe)
{
struct vm_area_struct *vma = fe->vma;
pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
/*
 * We are going to consume the prealloc table,
 * count that as nr_ptes.
 */
atomic_long_inc(&vma->vm_mm->nr_ptes);
/* The table has been handed to the deposit; forget our pointer to it. */
fe->prealloc_pte = 0;
}
static int do_set_pmd(struct fault_env *fe, struct page *page)
{
struct vm_area_struct *vma = fe->vma;
@ -2953,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
ret = VM_FAULT_FALLBACK;
page = compound_head(page);
/*
* Archs like ppc64 need additional space to store information
* related to pte entry. Use the preallocated table for that.
*/
if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
if (!fe->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
if (unlikely(!pmd_none(*fe->pmd)))
goto out;
@ -2966,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
page_add_file_rmap(page, true);
/*
* deposit and withdraw with pmd lock held
*/
if (arch_needs_pgtable_deposit())
deposit_prealloc_pte(fe);
set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
@ -2975,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
ret = 0;
count_vm_event(THP_FILE_MAPPED);
out:
/*
* If we are going to fallback to pte mapping, do a
* withdraw with pmd lock held.
*/
if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
fe->pmd);
spin_unlock(fe->ptl);
return ret;
}
@ -3014,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
ret = do_set_pmd(fe, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
goto fault_handled;
}
if (!fe->pte) {
ret = pte_alloc_one_map(fe);
if (ret)
return ret;
goto fault_handled;
}
/* Re-check under ptl */
if (unlikely(!pte_none(*fe->pte)))
return VM_FAULT_NOPAGE;
if (unlikely(!pte_none(*fe->pte))) {
ret = VM_FAULT_NOPAGE;
goto fault_handled;
}
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
@ -3045,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, fe->address, fe->pte);
ret = 0;
return 0;
fault_handled:
/* preallocated pagetable is unused: free it */
if (fe->prealloc_pte) {
pte_free(fe->vma->vm_mm, fe->prealloc_pte);
fe->prealloc_pte = 0;
}
return ret;
}
static unsigned long fault_around_bytes __read_mostly =
@ -3145,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
/* preallocated pagetable is unused: free it */
if (fe->prealloc_pte) {
pte_free(fe->vma->vm_mm, fe->prealloc_pte);
fe->prealloc_pte = 0;
}
/* Huge page is mapped? Page fault is solved */
if (pmd_trans_huge(*fe->pmd)) {
ret = VM_FAULT_NOPAGE;
@ -3454,7 +3490,7 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
/* COW handled on pte level: split pmd */
VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
split_huge_pmd(fe->vma, fe->pmd, fe->address);
__split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL);
return VM_FAULT_FALLBACK;
}

View file

@ -1727,26 +1727,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
static int __init cmdline_parse_movable_node(char *p)
{
#ifdef CONFIG_MOVABLE_NODE
/*
* Memory used by the kernel cannot be hot-removed because Linux
* cannot migrate the kernel pages. When memory hotplug is
* enabled, we should prevent memblock from allocating memory
* for the kernel.
*
* ACPI SRAT records all hotpluggable memory ranges. But before
* SRAT is parsed, we don't know about it.
*
* The kernel image is loaded into memory at very early time. We
* cannot prevent this anyway. So on NUMA system, we set any
* node the kernel resides in as un-hotpluggable.
*
* Since on modern servers, one node could have double-digit
* gigabytes memory, we can assume the memory around the kernel
* image is also un-hotpluggable. So before SRAT is parsed, just
* allocate memory near the kernel image to try the best to keep
* the kernel away from hotpluggable memory.
*/
memblock_set_bottom_up(true);
movable_node_enabled = true;
#else
pr_warn("movable_node option not supported\n");

View file

@ -276,7 +276,9 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
return ERR_PTR(-EINVAL);
}
} else if (mode == MPOL_LOCAL) {
if (!nodes_empty(*nodes))
if (!nodes_empty(*nodes) ||
(flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL);
mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
@ -496,7 +498,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, addr);
__split_huge_pmd(vma, pmd, addr, false, NULL);
} else {
get_page(page);
spin_unlock(ptl);
@ -1679,25 +1681,17 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
int nd)
{
switch (policy->mode) {
case MPOL_PREFERRED:
if (!(policy->flags & MPOL_F_LOCAL))
nd = policy->v.preferred_node;
break;
case MPOL_BIND:
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
nd = policy->v.preferred_node;
else {
/*
* Normally, MPOL_BIND allocations are node-local within the
* allowed nodemask. However, if __GFP_THISNODE is set and the
* current node isn't part of the mask, we use the zonelist for
* the first node in the mask instead.
* __GFP_THISNODE shouldn't even be used with the bind policy
* because we might easily break the expectation to stay on the
* requested node and not break the policy.
*/
if (unlikely(gfp & __GFP_THISNODE) &&
unlikely(!node_isset(nd, policy->v.nodes)))
nd = first_node(policy->v.nodes);
break;
default:
BUG();
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
}
return node_zonelist(nd, gfp);
}

View file

@ -168,8 +168,6 @@ void putback_movable_pages(struct list_head *l)
continue;
}
list_del(&page->lru);
dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
/*
* We isolated non-lru movable page so here we can use
* __PageMovable because LRU page's mapping cannot have
@ -186,6 +184,8 @@ void putback_movable_pages(struct list_head *l)
put_page(page);
} else {
putback_lru_page(page);
dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
}
}
}
@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
SetPageDirty(newpage);
}
radix_tree_replace_slot(pslot, newpage);
radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
/*
* Drop cache reference from old page by unfreezing
@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
get_page(newpage);
radix_tree_replace_slot(pslot, newpage);
radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
page_ref_unfreeze(page, expected_count - 1);
@ -1121,8 +1121,15 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
* restored.
*/
list_del(&page->lru);
dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
/*
* Compaction can also migrate non-LRU pages which are
* not accounted to NR_ISOLATED_*. They can be recognized
* as __PageMovable
*/
if (likely(!__PageMovable(page)))
dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
}
/*

View file

@ -69,11 +69,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
int target_node = NUMA_NO_NODE;
pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
if (!pte)
return 0;
/* Get target node for single threaded private VMAs */
if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
atomic_read(&vma->vm_mm->mm_users) == 1)
target_node = numa_node_id();
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
@ -95,6 +101,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
continue;
/*
* Don't mess with PTEs if page is already on the node
* a single-threaded process is running on.
*/
if (target_node == page_to_nid(page))
continue;
}
ptent = ptep_modify_prot_start(mm, addr, pte);
@ -163,7 +176,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
split_huge_pmd(vma, pmd, addr);
__split_huge_pmd(vma, pmd, addr, false, NULL);
if (pmd_trans_unstable(pmd))
continue;
} else {
@ -484,6 +497,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
return do_mprotect_pkey(start, len, prot, -1);
}
#ifdef CONFIG_ARCH_HAS_PKEYS
SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
unsigned long, prot, int, pkey)
{
@ -534,3 +549,5 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
*/
return ret;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */

View file

@ -2058,8 +2058,12 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
* potentially hurts the reliability of high-order allocations when under
* intense memory pressure but failed atomic allocations should be easier
* to recover from than an OOM.
*
* If @force is true, try to unreserve a pageblock even though highatomic
* pageblock is exhausted.
*/
static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
bool force)
{
struct zonelist *zonelist = ac->zonelist;
unsigned long flags;
@ -2067,11 +2071,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
struct zone *zone;
struct page *page;
int order;
bool ret;
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
ac->nodemask) {
/* Preserve at least one pageblock */
if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
/*
* Preserve at least one pageblock unless memory pressure
* is really high.
*/
if (!force && zone->nr_reserved_highatomic <=
pageblock_nr_pages)
continue;
spin_lock_irqsave(&zone->lock, flags);
@ -2085,13 +2094,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
continue;
/*
* It should never happen but changes to locking could
* inadvertently allow a per-cpu drain to add pages
* to MIGRATE_HIGHATOMIC while unreserving so be safe
* and watch for underflows.
* In page freeing path, migratetype change is racy so
* we can encounter several free pages in a pageblock
* in this loop although we changed the pageblock type
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
zone->nr_reserved_highatomic);
if (get_pageblock_migratetype(page) ==
MIGRATE_HIGHATOMIC) {
/*
* It should never happen but changes to
* locking could inadvertently allow a per-cpu
* drain to add pages to MIGRATE_HIGHATOMIC
* while unreserving so be safe and watch for
* underflows.
*/
zone->nr_reserved_highatomic -= min(
pageblock_nr_pages,
zone->nr_reserved_highatomic);
}
/*
* Convert to ac->migratetype and avoid the normal
@ -2103,12 +2124,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
* may increase.
*/
set_pageblock_migratetype(page, ac->migratetype);
move_freepages_block(zone, page, ac->migratetype);
spin_unlock_irqrestore(&zone->lock, flags);
return;
ret = move_freepages_block(zone, page, ac->migratetype);
if (ret) {
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
}
spin_unlock_irqrestore(&zone->lock, flags);
}
return false;
}
/* Remove an element from the buddy allocator from the fallback list */
@ -2133,7 +2158,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
if (can_steal)
if (can_steal &&
get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
steal_suitable_fallback(zone, page, start_migratetype);
/* Remove the page from the freelists */
@ -2192,7 +2218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, bool cold)
{
int i;
int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
@ -2217,13 +2243,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
else
list_add_tail(&page->lru, list);
list = &page->lru;
alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
* on i. Do not confuse with 'alloced' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
return i;
return alloced;
}
#ifdef CONFIG_NUMA
@ -2534,7 +2568,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
&& mt != MIGRATE_HIGHATOMIC)
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@ -3305,7 +3340,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
* Shrink them and try again
*/
if (!page && !drained) {
unreserve_highatomic_pageblock(ac);
unreserve_highatomic_pageblock(ac, false);
drain_all_pages(NULL);
drained = true;
goto retry;
@ -3422,8 +3457,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* Make sure we converge to OOM if we cannot make any progress
* several times in the row.
*/
if (*no_progress_loops > MAX_RECLAIM_RETRIES)
return false;
if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
/* Before OOM, exhaust highatomic_reserve */
return unreserve_highatomic_pageblock(ac, true);
}
/*
* Keep reclaiming pages while there is a chance this will lead

View file

@ -2093,6 +2093,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
size_t pages_size;
struct page **pages;
int unit, i, j, rc;
int upa;
int nr_g0_units;
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
@ -2100,7 +2102,12 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
if (IS_ERR(ai))
return PTR_ERR(ai);
BUG_ON(ai->nr_groups != 1);
BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
upa = ai->alloc_size/ai->unit_size;
nr_g0_units = roundup(num_possible_cpus(), upa);
if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
pcpu_free_alloc_info(ai);
return -EINVAL;
}
unit_pages = ai->unit_size >> PAGE_SHIFT;
@ -2111,21 +2118,22 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
/* allocate pages */
j = 0;
for (unit = 0; unit < num_possible_cpus(); unit++)
for (unit = 0; unit < num_possible_cpus(); unit++) {
unsigned int cpu = ai->groups[0].cpu_map[unit];
for (i = 0; i < unit_pages; i++) {
unsigned int cpu = ai->groups[0].cpu_map[unit];
void *ptr;
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr) {
pr_warn("failed to allocate %s page for cpu%u\n",
psize_str, cpu);
psize_str, cpu);
goto enomem;
}
/* kmemleak tracks the percpu allocations separately */
kmemleak_free(ptr);
pages[j++] = virt_to_page(ptr);
}
}
/* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC;

View file

@ -207,12 +207,21 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
* memory at once.
*/
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
pgoff_t offset, unsigned long nr_to_read)
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
struct file_ra_state *ra = &filp->f_ra;
unsigned long max_pages;
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
/*
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
nr_to_read = min(nr_to_read, max_pages);
while (nr_to_read) {
int err;
@ -369,9 +378,17 @@ ondemand_readahead(struct address_space *mapping,
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
unsigned long max = ra->ra_pages;
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages = ra->ra_pages;
pgoff_t prev_offset;
/*
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
if (req_size > max_pages && bdi->io_pages > max_pages)
max_pages = min(req_size, bdi->io_pages);
/*
* start of file
*/
@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping,
if ((offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max);
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
start = page_cache_next_hole(mapping, offset + 1, max);
start = page_cache_next_hole(mapping, offset + 1, max_pages);
rcu_read_unlock();
if (!start || start - offset > max)
if (!start || start - offset > max_pages)
return 0;
ra->start = start;
ra->size = start - offset; /* old async_size */
ra->size += req_size;
ra->size = get_next_ra_size(ra, max);
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping,
/*
* oversize read
*/
if (req_size > max)
if (req_size > max_pages)
goto initial_readahead;
/*
@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping,
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
if (try_context_readahead(mapping, ra, offset, req_size, max))
if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
goto readit;
/*
@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping,
initial_readahead:
ra->start = offset;
ra->size = get_init_ra_size(req_size, max);
ra->size = get_init_ra_size(req_size, max_pages);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
readit:
@ -454,7 +471,7 @@ ondemand_readahead(struct address_space *mapping,
* the resulted next readahead window into the current one.
*/
if (offset == ra->start && ra->size == ra->async_size) {
ra->async_size = get_next_ra_size(ra, max);
ra->async_size = get_next_ra_size(ra, max_pages);
ra->size += ra->async_size;
}

View file

@ -141,14 +141,15 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
}
/**
* anon_vma_prepare - attach an anon_vma to a memory region
* __anon_vma_prepare - attach an anon_vma to a memory region
* @vma: the memory region in question
*
* This makes sure the memory mapping described by 'vma' has
* an 'anon_vma' attached to it, so that we can associate the
* anonymous pages mapped into it with that anon_vma.
*
* The common case will be that we already have one, but if
* The common case will be that we already have one, which
* is handled inline by anon_vma_prepare(). But if
* not we either need to find an adjacent mapping that we
* can re-use the anon_vma from (very common when the only
* reason for splitting a vma has been mprotect()), or we
@ -167,48 +168,46 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
*
* This must be called with the mmap_sem held for reading.
*/
int anon_vma_prepare(struct vm_area_struct *vma)
int __anon_vma_prepare(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *anon_vma, *allocated;
struct anon_vma_chain *avc;
might_sleep();
if (unlikely(!anon_vma)) {
struct mm_struct *mm = vma->vm_mm;
struct anon_vma *allocated;
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_enomem;
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_enomem;
anon_vma = find_mergeable_anon_vma(vma);
allocated = NULL;
if (!anon_vma) {
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
allocated = anon_vma;
}
anon_vma_lock_write(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_link(vma, avc, anon_vma);
/* vma reference or self-parent link for new root */
anon_vma->degree++;
allocated = NULL;
avc = NULL;
}
spin_unlock(&mm->page_table_lock);
anon_vma_unlock_write(anon_vma);
if (unlikely(allocated))
put_anon_vma(allocated);
if (unlikely(avc))
anon_vma_chain_free(avc);
anon_vma = find_mergeable_anon_vma(vma);
allocated = NULL;
if (!anon_vma) {
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
allocated = anon_vma;
}
anon_vma_lock_write(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_link(vma, avc, anon_vma);
/* vma reference or self-parent link for new root */
anon_vma->degree++;
allocated = NULL;
avc = NULL;
}
spin_unlock(&mm->page_table_lock);
anon_vma_unlock_write(anon_vma);
if (unlikely(allocated))
put_anon_vma(allocated);
if (unlikely(avc))
anon_vma_chain_free(avc);
return 0;
out_enomem_free_avc:

View file

@ -300,18 +300,19 @@ void shmem_uncharge(struct inode *inode, long pages)
static int shmem_radix_tree_replace(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
struct radix_tree_node *node;
void **pslot;
void *item;
VM_BUG_ON(!expected);
VM_BUG_ON(!replacement);
pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
if (!pslot)
item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
if (!item)
return -ENOENT;
item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
if (item != expected)
return -ENOENT;
radix_tree_replace_slot(pslot, replacement);
__radix_tree_replace(&mapping->page_tree, node, pslot,
replacement, NULL, NULL);
return 0;
}
@ -370,6 +371,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
int shmem_huge __read_mostly;
#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static int shmem_parse_huge(const char *str)
{
if (!strcmp(str, "never"))
@ -407,6 +409,7 @@ static const char *shmem_format_huge(int huge)
return "bad_val";
}
}
#endif
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
struct shrink_control *sc, unsigned long nr_to_split)
@ -1539,7 +1542,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct mm_struct *fault_mm, int *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
struct mem_cgroup *memcg;
@ -1589,7 +1592,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
* Fast cache lookup did not find it:
* bring it back from swap or allocate.
*/
info = SHMEM_I(inode);
sbinfo = SHMEM_SB(inode->i_sb);
charge_mm = fault_mm ? : current->mm;
@ -1837,7 +1839,6 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
put_page(page);
}
if (error == -ENOSPC && !once++) {
info = SHMEM_I(inode);
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);

129
mm/slab.c
View file

@ -227,13 +227,14 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
INIT_LIST_HEAD(&parent->slabs_full);
INIT_LIST_HEAD(&parent->slabs_partial);
INIT_LIST_HEAD(&parent->slabs_free);
parent->total_slabs = 0;
parent->free_slabs = 0;
parent->shared = NULL;
parent->alien = NULL;
parent->colour_next = 0;
spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
parent->num_slabs = 0;
}
#define MAKE_LIST(cachep, listp, slab, nodeid) \
@ -1366,7 +1367,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
{
#if DEBUG
struct kmem_cache_node *n;
struct page *page;
unsigned long flags;
int node;
static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@ -1381,32 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
cachep->name, cachep->size, cachep->gfporder);
for_each_kmem_cache_node(cachep, node, n) {
unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
unsigned long active_slabs = 0, num_slabs = 0;
unsigned long num_slabs_partial = 0, num_slabs_free = 0;
unsigned long num_slabs_full;
unsigned long total_slabs, free_slabs, free_objs;
spin_lock_irqsave(&n->list_lock, flags);
num_slabs = n->num_slabs;
list_for_each_entry(page, &n->slabs_partial, lru) {
active_objs += page->active;
num_slabs_partial++;
}
list_for_each_entry(page, &n->slabs_free, lru)
num_slabs_free++;
free_objects += n->free_objects;
total_slabs = n->total_slabs;
free_slabs = n->free_slabs;
free_objs = n->free_objects;
spin_unlock_irqrestore(&n->list_lock, flags);
num_objs = num_slabs * cachep->num;
active_slabs = num_slabs - num_slabs_free;
num_slabs_full = num_slabs -
(num_slabs_partial + num_slabs_free);
active_objs += (num_slabs_full * cachep->num);
pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
node, active_slabs, num_slabs, active_objs, num_objs,
free_objects);
pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
node, total_slabs - free_slabs, total_slabs,
(total_slabs * cachep->num) - free_objs,
total_slabs * cachep->num);
}
#endif
}
@ -2318,7 +2304,8 @@ static int drain_freelist(struct kmem_cache *cache,
page = list_entry(p, struct page, lru);
list_del(&page->lru);
n->num_slabs--;
n->free_slabs--;
n->total_slabs--;
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
@ -2332,7 +2319,7 @@ static int drain_freelist(struct kmem_cache *cache,
return nr_freed;
}
int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
int __kmem_cache_shrink(struct kmem_cache *cachep)
{
int ret = 0;
int node;
@ -2352,7 +2339,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
return __kmem_cache_shrink(cachep, false);
return __kmem_cache_shrink(cachep);
}
void __kmem_cache_release(struct kmem_cache *cachep)
@ -2753,12 +2740,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
n = get_node(cachep, page_to_nid(page));
spin_lock(&n->list_lock);
if (!page->active)
n->total_slabs++;
if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
else
n->free_slabs++;
} else
fixup_slab_list(cachep, n, page, &list);
n->num_slabs++;
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
spin_unlock(&n->list_lock);
@ -2903,9 +2891,10 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
/* Move pfmemalloc slab to the end of list to speed up next search */
list_del(&page->lru);
if (!page->active)
if (!page->active) {
list_add_tail(&page->lru, &n->slabs_free);
else
n->free_slabs++;
} else
list_add_tail(&page->lru, &n->slabs_partial);
list_for_each_entry(page, &n->slabs_partial, lru) {
@ -2913,9 +2902,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
return page;
}
n->free_touched = 1;
list_for_each_entry(page, &n->slabs_free, lru) {
if (!PageSlabPfmemalloc(page))
if (!PageSlabPfmemalloc(page)) {
n->free_slabs--;
return page;
}
}
return NULL;
@ -2925,16 +2917,18 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
page = list_first_entry_or_null(&n->slabs_partial,
struct page, lru);
assert_spin_locked(&n->list_lock);
page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
if (!page) {
n->free_touched = 1;
page = list_first_entry_or_null(&n->slabs_free,
struct page, lru);
page = list_first_entry_or_null(&n->slabs_free, struct page,
lru);
if (page)
n->free_slabs--;
}
if (sk_memalloc_socks())
return get_valid_first_slab(n, page, pfmemalloc);
page = get_valid_first_slab(n, page, pfmemalloc);
return page;
}
@ -3434,9 +3428,10 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
STATS_DEC_ACTIVE(cachep);
/* fixup slab chains */
if (page->active == 0)
if (page->active == 0) {
list_add(&page->lru, &n->slabs_free);
else {
n->free_slabs++;
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
@ -3450,7 +3445,8 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
page = list_last_entry(&n->slabs_free, struct page, lru);
list_move(&page->lru, list);
n->num_slabs--;
n->free_slabs--;
n->total_slabs--;
}
}
@ -4102,64 +4098,33 @@ static void cache_reap(struct work_struct *w)
#ifdef CONFIG_SLABINFO
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
struct page *page;
unsigned long active_objs;
unsigned long num_objs;
unsigned long active_slabs = 0;
unsigned long num_slabs, free_objects = 0, shared_avail = 0;
unsigned long num_slabs_partial = 0, num_slabs_free = 0;
unsigned long num_slabs_full = 0;
const char *name;
char *error = NULL;
unsigned long active_objs, num_objs, active_slabs;
unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
unsigned long free_slabs = 0;
int node;
struct kmem_cache_node *n;
active_objs = 0;
num_slabs = 0;
for_each_kmem_cache_node(cachep, node, n) {
check_irq_on();
spin_lock_irq(&n->list_lock);
num_slabs += n->num_slabs;
total_slabs += n->total_slabs;
free_slabs += n->free_slabs;
free_objs += n->free_objects;
list_for_each_entry(page, &n->slabs_partial, lru) {
if (page->active == cachep->num && !error)
error = "slabs_partial accounting error";
if (!page->active && !error)
error = "slabs_partial accounting error";
active_objs += page->active;
num_slabs_partial++;
}
list_for_each_entry(page, &n->slabs_free, lru) {
if (page->active && !error)
error = "slabs_free accounting error";
num_slabs_free++;
}
free_objects += n->free_objects;
if (n->shared)
shared_avail += n->shared->avail;
spin_unlock_irq(&n->list_lock);
}
num_objs = num_slabs * cachep->num;
active_slabs = num_slabs - num_slabs_free;
num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free);
active_objs += (num_slabs_full * cachep->num);
if (num_objs - active_objs != free_objects && !error)
error = "free_objects accounting error";
name = cachep->name;
if (error)
pr_err("slab: cache %s error: %s\n", name, error);
num_objs = total_slabs * cachep->num;
active_slabs = total_slabs - free_slabs;
active_objs = num_objs - free_objs;
sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
sinfo->active_slabs = active_slabs;
sinfo->num_slabs = num_slabs;
sinfo->num_slabs = total_slabs;
sinfo->shared_avail = shared_avail;
sinfo->limit = cachep->limit;
sinfo->batchcount = cachep->batchcount;

View file

@ -142,11 +142,26 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define SLAB_CACHE_FLAGS (0)
#endif
/* Common flags available with current configuration */
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
/* Common flags permitted for kmem_cache_create */
#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
SLAB_RED_ZONE | \
SLAB_POISON | \
SLAB_STORE_USER | \
SLAB_TRACE | \
SLAB_CONSISTENCY_CHECKS | \
SLAB_MEM_SPREAD | \
SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | \
SLAB_NOTRACK | \
SLAB_ACCOUNT)
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *, bool);
int __kmem_cache_shrink(struct kmem_cache *);
void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
@ -432,7 +447,8 @@ struct kmem_cache_node {
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
unsigned long num_slabs;
unsigned long total_slabs; /* length of all slab lists */
unsigned long free_slabs; /* length of free slab list only */
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */

View file

@ -404,6 +404,12 @@ kmem_cache_create(const char *name, size_t size, size_t align,
goto out_unlock;
}
/* Refuse requests with allocator specific flags */
if (flags & ~SLAB_FLAGS_PERMITTED) {
err = -EINVAL;
goto out_unlock;
}
/*
* Some allocators will constraint the set of valid flags to a subset
* of all flags. We expect them to define CACHE_CREATE_MASK in this
@ -573,6 +579,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
get_online_cpus();
get_online_mems();
#ifdef CONFIG_SLUB
/*
* In case of SLUB, we need to disable empty slab caching to
* avoid pinning the offline memory cgroup by freeable kmem
* pages charged to it. SLAB doesn't need this, as it
* periodically purges unused slabs.
*/
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL;
if (c) {
c->cpu_partial = 0;
c->min_partial = 0;
}
}
mutex_unlock(&slab_mutex);
/*
* kmem_cache->cpu_partial is checked locklessly (see
* put_cpu_partial()). Make sure the change is visible.
*/
synchronize_sched();
#endif
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
if (!is_root_cache(s))
@ -584,7 +613,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
if (!c)
continue;
__kmem_cache_shrink(c, true);
__kmem_cache_shrink(c);
arr->entries[idx] = NULL;
}
mutex_unlock(&slab_mutex);
@ -755,7 +784,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
get_online_cpus();
get_online_mems();
kasan_cache_shrink(cachep);
ret = __kmem_cache_shrink(cachep, false);
ret = __kmem_cache_shrink(cachep);
put_online_mems();
put_online_cpus();
return ret;

View file

@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c)
{
}
int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
int __kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
}

Some files were not shown because too many files have changed in this diff Show more