Merge branch 'akpm' (patches from Andrew)
Merge still more updates from Andrew Morton:
 "16 patches.

  Subsystems affected by this patch series: ocfs2, nilfs2, mailmap, and
  mm (madvise, mlock, kfence, memory-failure, kasan, debug, kmemleak,
  and damon)"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/damon: prevent activated scheme from sleeping by deactivated schemes
  mm/kmemleak: reset tag when compare object pointer
  doc/vm/page_owner.rst: remove content related to -c option
  tools/vm/page_owner_sort.c: remove -c option
  mm, kasan: fix __GFP_BITS_SHIFT definition breaking LOCKDEP
  mm,hwpoison: unmap poisoned page before invalidation
  mailmap: update Kirill's email
  mm: kfence: fix objcgs vector allocation
  mm/munlock: protect the per-CPU pagevec by a local_lock_t
  mm/munlock: update Documentation/vm/unevictable-lru.rst
  mm/munlock: add lru_add_drain() to fix memcg_stat_test
  nilfs2: get rid of nilfs_mapping_init()
  nilfs2: fix lockdep warnings during disk space reclamation
  nilfs2: fix lockdep warnings in page operations for btree nodes
  ocfs2: fix crash when mount with quota enabled
  Revert "mm: madvise: skip unmapped vma holes passed to process_madvise"
commit b012b3235c
33 changed files with 543 additions and 382 deletions
.mailmap | 1 +

@@ -213,6 +213,7 @@ Kees Cook <keescook@chromium.org> <kees@ubuntu.com>
 Keith Busch <kbusch@kernel.org> <keith.busch@intel.com>
 Keith Busch <kbusch@kernel.org> <keith.busch@linux.intel.com>
 Kenneth W Chen <kenneth.w.chen@intel.com>
+Kirill Tkhai <kirill.tkhai@openvz.org> <ktkhai@virtuozzo.com>
 Konstantin Khlebnikov <koct9i@gmail.com> <khlebnikov@yandex-team.ru>
 Konstantin Khlebnikov <koct9i@gmail.com> <k.khlebnikov@samsung.com>
 Koushik <raghavendra.koushik@neterion.com>
@@ -125,7 +125,6 @@ Usage
 additional function:

 Cull:
-	-c		Cull by comparing stacktrace instead of total block.
 	--cull <rules>
 		Specify culling rules. Culling syntax is key[,key[,...]]. Choose a
 		multi-letter key from the **STANDARD FORMAT SPECIFIERS** section.
|
@ -52,8 +52,13 @@ The infrastructure may also be able to handle other conditions that make pages
|
|||
unevictable, either by definition or by circumstance, in the future.
|
||||
|
||||
|
||||
The Unevictable Page List
|
||||
-------------------------
|
||||
The Unevictable LRU Page List
|
||||
-----------------------------
|
||||
|
||||
The Unevictable LRU page list is a lie. It was never an LRU-ordered list, but a
|
||||
companion to the LRU-ordered anonymous and file, active and inactive page lists;
|
||||
and now it is not even a page list. But following familiar convention, here in
|
||||
this document and in the source, we often imagine it as a fifth LRU page list.
|
||||
|
||||
The Unevictable LRU infrastructure consists of an additional, per-node, LRU list
|
||||
called the "unevictable" list and an associated page flag, PG_unevictable, to
|
||||
|
@ -63,8 +68,8 @@ The PG_unevictable flag is analogous to, and mutually exclusive with, the
|
|||
PG_active flag in that it indicates on which LRU list a page resides when
|
||||
PG_lru is set.
|
||||
|
||||
The Unevictable LRU infrastructure maintains unevictable pages on an additional
|
||||
LRU list for a few reasons:
|
||||
The Unevictable LRU infrastructure maintains unevictable pages as if they were
|
||||
on an additional LRU list for a few reasons:
|
||||
|
||||
(1) We get to "treat unevictable pages just like we treat other pages in the
|
||||
system - which means we get to use the same code to manipulate them, the
|
||||
|
@ -72,13 +77,11 @@ LRU list for a few reasons:
|
|||
of the statistics, etc..." [Rik van Riel]
|
||||
|
||||
(2) We want to be able to migrate unevictable pages between nodes for memory
|
||||
defragmentation, workload management and memory hotplug. The linux kernel
|
||||
defragmentation, workload management and memory hotplug. The Linux kernel
|
||||
can only migrate pages that it can successfully isolate from the LRU
|
||||
lists. If we were to maintain pages elsewhere than on an LRU-like list,
|
||||
where they can be found by isolate_lru_page(), we would prevent their
|
||||
migration, unless we reworked migration code to find the unevictable pages
|
||||
itself.
|
||||
|
||||
lists (or "Movable" pages: outside of consideration here). If we were to
|
||||
maintain pages elsewhere than on an LRU-like list, where they can be
|
||||
detected by isolate_lru_page(), we would prevent their migration.
|
||||
|
||||
The unevictable list does not differentiate between file-backed and anonymous,
|
||||
swap-backed pages. This differentiation is only important while the pages are,
|
||||
|
@ -92,8 +95,8 @@ Memory Control Group Interaction
|
|||
--------------------------------
|
||||
|
||||
The unevictable LRU facility interacts with the memory control group [aka
|
||||
memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by extending the
|
||||
lru_list enum.
|
||||
memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by
|
||||
extending the lru_list enum.
|
||||
|
||||
The memory controller data structure automatically gets a per-node unevictable
|
||||
list as a result of the "arrayification" of the per-node LRU lists (one per
|
||||
|
@ -143,7 +146,6 @@ These are currently used in three places in the kernel:
|
|||
and this mark remains for the life of the inode.
|
||||
|
||||
(2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called.
|
||||
|
||||
Note that SHM_LOCK is not required to page in the locked pages if they're
|
||||
swapped out; the application must touch the pages manually if it wants to
|
||||
ensure they're in memory.
|
||||
|
@ -156,19 +158,19 @@ These are currently used in three places in the kernel:
|
|||
Detecting Unevictable Pages
|
||||
---------------------------
|
||||
|
||||
The function page_evictable() in vmscan.c determines whether a page is
|
||||
The function page_evictable() in mm/internal.h determines whether a page is
|
||||
evictable or not using the query function outlined above [see section
|
||||
:ref:`Marking address spaces unevictable <mark_addr_space_unevict>`]
|
||||
to check the AS_UNEVICTABLE flag.
|
||||
|
||||
For address spaces that are so marked after being populated (as SHM regions
|
||||
might be), the lock action (eg: SHM_LOCK) can be lazy, and need not populate
|
||||
might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate
|
||||
the page tables for the region as does, for example, mlock(), nor need it make
|
||||
any special effort to push any pages in the SHM_LOCK'd area to the unevictable
|
||||
list. Instead, vmscan will do this if and when it encounters the pages during
|
||||
a reclamation scan.
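As a small userspace illustration of that lazy behaviour (added here for reference, not part of the patch; it assumes RLIMIT_MEMLOCK or CAP_IPC_LOCK allows SHM_LOCK), the segment below is locked before any of its pages exist, and the pages only become candidates for the unevictable list once they are touched::

    #include <stdio.h>
    #include <string.h>
    #include <sys/ipc.h>
    #include <sys/shm.h>

    int main(void)
    {
        size_t len = 4 << 20;
        int id = shmget(IPC_PRIVATE, len, IPC_CREAT | 0600);
        if (id < 0) { perror("shmget"); return 1; }

        /* Lazy: marks the mapping unevictable, faults nothing in. */
        if (shmctl(id, SHM_LOCK, NULL) != 0)
            perror("shmctl(SHM_LOCK)");

        char *p = shmat(id, NULL, 0);
        if (p == (void *)-1) { perror("shmat"); return 1; }
        memset(p, 0, len);          /* pages now exist, SHM_LOCK'd */

        /* SHM_UNLOCK triggers the "rescue" scan described next. */
        shmctl(id, SHM_UNLOCK, NULL);
        shmdt(p);
        shmctl(id, IPC_RMID, NULL);
        return 0;
    }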
|
||||
|
||||
On an unlock action (such as SHM_UNLOCK), the unlocker (eg: shmctl()) must scan
|
||||
On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan
|
||||
the pages in the region and "rescue" them from the unevictable list if no other
|
||||
condition is keeping them unevictable. If an unevictable region is destroyed,
|
||||
the pages are also "rescued" from the unevictable list in the process of
|
||||
|
@ -176,7 +178,7 @@ freeing them.
|
|||
|
||||
page_evictable() also checks for mlocked pages by testing an additional page
|
||||
flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is
|
||||
faulted into a VM_LOCKED vma, or found in a vma being VM_LOCKED.
|
||||
faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED.
|
||||
|
||||
|
||||
Vmscan's Handling of Unevictable Pages
|
||||
|
@ -186,28 +188,23 @@ If unevictable pages are culled in the fault path, or moved to the unevictable
|
|||
list at mlock() or mmap() time, vmscan will not encounter the pages until they
|
||||
have become evictable again (via munlock() for example) and have been "rescued"
|
||||
from the unevictable list. However, there may be situations where we decide,
|
||||
for the sake of expediency, to leave a unevictable page on one of the regular
|
||||
for the sake of expediency, to leave an unevictable page on one of the regular
|
||||
active/inactive LRU lists for vmscan to deal with. vmscan checks for such
|
||||
pages in all of the shrink_{active|inactive|page}_list() functions and will
|
||||
"cull" such pages that it encounters: that is, it diverts those pages to the
|
||||
unevictable list for the node being scanned.
|
||||
unevictable list for the memory cgroup and node being scanned.
|
||||
|
||||
There may be situations where a page is mapped into a VM_LOCKED VMA, but the
|
||||
page is not marked as PG_mlocked. Such pages will make it all the way to
|
||||
shrink_page_list() where they will be detected when vmscan walks the reverse
|
||||
map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK,
|
||||
shrink_page_list() will cull the page at that point.
|
||||
shrink_active_list() or shrink_page_list() where they will be detected when
|
||||
vmscan walks the reverse map in page_referenced() or try_to_unmap(). The page
|
||||
is culled to the unevictable list when it is released by the shrinker.
|
||||
|
||||
To "cull" an unevictable page, vmscan simply puts the page back on the LRU list
|
||||
using putback_lru_page() - the inverse operation to isolate_lru_page() - after
|
||||
dropping the page lock. Because the condition which makes the page unevictable
|
||||
may change once the page is unlocked, putback_lru_page() will recheck the
|
||||
unevictable state of a page that it places on the unevictable list. If the
|
||||
page has become unevictable, putback_lru_page() removes it from the list and
|
||||
retries, including the page_unevictable() test. Because such a race is a rare
|
||||
event and movement of pages onto the unevictable list should be rare, these
|
||||
extra evictabilty checks should not occur in the majority of calls to
|
||||
putback_lru_page().
|
||||
may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the
|
||||
unevictable state of a page before placing it on the unevictable list.
|
||||
|
||||
|
||||
MLOCKED Pages
|
||||
|
@ -227,16 +224,25 @@ Nick posted his patch as an alternative to a patch posted by Christoph Lameter
|
|||
to achieve the same objective: hiding mlocked pages from vmscan.
|
||||
|
||||
In Nick's patch, he used one of the struct page LRU list link fields as a count
|
||||
of VM_LOCKED VMAs that map the page. This use of the link field for a count
|
||||
prevented the management of the pages on an LRU list, and thus mlocked pages
|
||||
were not migratable as isolate_lru_page() could not find them, and the LRU list
|
||||
link field was not available to the migration subsystem.
|
||||
of VM_LOCKED VMAs that map the page (Rik van Riel had the same idea three years
|
||||
earlier). But this use of the link field for a count prevented the management
|
||||
of the pages on an LRU list, and thus mlocked pages were not migratable as
|
||||
isolate_lru_page() could not detect them, and the LRU list link field was not
|
||||
available to the migration subsystem.
|
||||
|
||||
Nick resolved this by putting mlocked pages back on the lru list before
|
||||
Nick resolved this by putting mlocked pages back on the LRU list before
|
||||
attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When
|
||||
Nick's patch was integrated with the Unevictable LRU work, the count was
|
||||
replaced by walking the reverse map to determine whether any VM_LOCKED VMAs
|
||||
mapped the page. More on this below.
|
||||
replaced by walking the reverse map when munlocking, to determine whether any
|
||||
other VM_LOCKED VMAs still mapped the page.
|
||||
|
||||
However, walking the reverse map for each page when munlocking was ugly and
|
||||
inefficient, and could lead to catastrophic contention on a file's rmap lock,
|
||||
when many processes which had it mlocked were trying to exit. In 5.18, the
|
||||
idea of keeping mlock_count in Unevictable LRU list link field was revived and
|
||||
put to work, without preventing the migration of mlocked pages. This is why
|
||||
the "Unevictable LRU list" cannot be a linked list of pages now; but there was
|
||||
no use for that linked list anyway - though its size is maintained for meminfo.
|
||||
|
||||
|
||||
Basic Management
|
||||
|
@ -250,22 +256,18 @@ PageMlocked() functions.
|
|||
A PG_mlocked page will be placed on the unevictable list when it is added to
|
||||
the LRU. Such pages can be "noticed" by memory management in several places:
|
||||
|
||||
(1) in the mlock()/mlockall() system call handlers;
|
||||
(1) in the mlock()/mlock2()/mlockall() system call handlers;
|
||||
|
||||
(2) in the mmap() system call handler when mmapping a region with the
|
||||
MAP_LOCKED flag;
|
||||
|
||||
(3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE
|
||||
flag
|
||||
flag;
|
||||
|
||||
(4) in the fault path, if mlocked pages are "culled" in the fault path,
|
||||
and when a VM_LOCKED stack segment is expanded; or
|
||||
(4) in the fault path and when a VM_LOCKED stack segment is expanded; or
|
||||
|
||||
(5) as mentioned above, in vmscan:shrink_page_list() when attempting to
|
||||
reclaim a page in a VM_LOCKED VMA via try_to_unmap()
|
||||
|
||||
all of which result in the VM_LOCKED flag being set for the VMA if it doesn't
|
||||
already have it set.
|
||||
reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap().
|
||||
|
||||
mlocked pages become unlocked and rescued from the unevictable list when:
|
||||
|
||||
|
@ -280,51 +282,53 @@ mlocked pages become unlocked and rescued from the unevictable list when:
|
|||
(4) before a page is COW'd in a VM_LOCKED VMA.
|
||||
|
||||
|
||||
mlock()/mlockall() System Call Handling
|
||||
---------------------------------------
|
||||
mlock()/mlock2()/mlockall() System Call Handling
|
||||
------------------------------------------------
|
||||
|
||||
Both [do\_]mlock() and [do\_]mlockall() system call handlers call mlock_fixup()
|
||||
mlock(), mlock2() and mlockall() system call handlers proceed to mlock_fixup()
|
||||
for each VMA in the range specified by the call. In the case of mlockall(),
|
||||
this is the entire active address space of the task. Note that mlock_fixup()
|
||||
is used for both mlocking and munlocking a range of memory. A call to mlock()
|
||||
an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED is
|
||||
treated as a no-op, and mlock_fixup() simply returns.
|
||||
an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED, is
|
||||
treated as a no-op and mlock_fixup() simply returns.
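A minimal userspace check of that no-op behaviour (an illustration added here, not part of the patch; it assumes RLIMIT_MEMLOCK covers 64 KiB): locking an already-locked range and unlocking a never-locked range both succeed::

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 64 << 10;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        printf("munlock, never locked : %d\n", munlock(p, len));
        printf("mlock,   first time   : %d\n", mlock(p, len));
        printf("mlock,   second time  : %d\n", mlock(p, len));  /* still 0 */

        munmap(p, len);
        return 0;
    }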
|
||||
|
||||
If the VMA passes some filtering as described in "Filtering Special Vmas"
|
||||
If the VMA passes some filtering as described in "Filtering Special VMAs"
|
||||
below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
|
||||
off a subset of the VMA if the range does not cover the entire VMA. Once the
|
||||
VMA has been merged or split or neither, mlock_fixup() will call
|
||||
populate_vma_page_range() to fault in the pages via get_user_pages() and to
|
||||
mark the pages as mlocked via mlock_vma_page().
|
||||
off a subset of the VMA if the range does not cover the entire VMA. Any pages
|
||||
already present in the VMA are then marked as mlocked by mlock_page() via
|
||||
mlock_pte_range() via walk_page_range() via mlock_vma_pages_range().
|
||||
|
||||
Before returning from the system call, do_mlock() or mlockall() will call
|
||||
__mm_populate() to fault in the remaining pages via get_user_pages() and to
|
||||
mark those pages as mlocked as they are faulted.
|
||||
|
||||
Note that the VMA being mlocked might be mapped with PROT_NONE. In this case,
|
||||
get_user_pages() will be unable to fault in the pages. That's okay. If pages
|
||||
do end up getting faulted into this VM_LOCKED VMA, we'll handle them in the
|
||||
fault path or in vmscan.
|
||||
do end up getting faulted into this VM_LOCKED VMA, they will be handled in the
|
||||
fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled.
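For reference, a small userspace sketch of that MLOCK_ONFAULT behaviour (not part of the patch; it assumes glibc 2.27+ for the mlock2() wrapper and an adequate RLIMIT_MEMLOCK)::

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 256 << 10;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        /* Marks the range VM_LOCKED but populates nothing: each page
         * is mlocked in the fault path, when it is first touched. */
        if (mlock2(p, len, MLOCK_ONFAULT) != 0)
            perror("mlock2");

        memset(p, 0, len);          /* pages faulted in and mlocked here */

        munlock(p, len);
        munmap(p, len);
        return 0;
    }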
|
||||
|
||||
Also note that a page returned by get_user_pages() could be truncated or
|
||||
migrated out from under us, while we're trying to mlock it. To detect this,
|
||||
populate_vma_page_range() checks page_mapping() after acquiring the page lock.
|
||||
If the page is still associated with its mapping, we'll go ahead and call
|
||||
mlock_vma_page(). If the mapping is gone, we just unlock the page and move on.
|
||||
In the worst case, this will result in a page mapped in a VM_LOCKED VMA
|
||||
remaining on a normal LRU list without being PageMlocked(). Again, vmscan will
|
||||
detect and cull such pages.
|
||||
For each PTE (or PMD) being faulted into a VMA, the page add rmap function
|
||||
calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED
|
||||
(unless it is a PTE mapping of a part of a transparent huge page). Or when
|
||||
it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable()
|
||||
calls mlock_new_page() instead: similar to mlock_page(), but can make better
|
||||
judgments, since this page is held exclusively and known not to be on LRU yet.
|
||||
|
||||
mlock_vma_page() will call TestSetPageMlocked() for each page returned by
|
||||
get_user_pages(). We use TestSetPageMlocked() because the page might already
|
||||
be mlocked by another task/VMA and we don't want to do extra work. We
|
||||
especially do not want to count an mlocked page more than once in the
|
||||
statistics. If the page was already mlocked, mlock_vma_page() need do nothing
|
||||
more.
|
||||
mlock_page() sets PageMlocked immediately, then places the page on the CPU's
|
||||
mlock pagevec, to batch up the rest of the work to be done under lru_lock by
|
||||
__mlock_page(). __mlock_page() sets PageUnevictable, initializes mlock_count
|
||||
and moves the page to unevictable state ("the unevictable LRU", but with
|
||||
mlock_count in place of LRU threading). Or if the page was already PageLRU
|
||||
and PageUnevictable and PageMlocked, it simply increments the mlock_count.
|
||||
|
||||
If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the
|
||||
page from the LRU, as it is likely on the appropriate active or inactive list
|
||||
at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put
|
||||
back the page - by calling putback_lru_page() - which will notice that the page
|
||||
is now mlocked and divert the page to the node's unevictable list. If
|
||||
mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle
|
||||
it later if and when it attempts to reclaim the page.
|
||||
But in practice that may not work ideally: the page may not yet be on an LRU, or
|
||||
it may have been temporarily isolated from LRU. In such cases the mlock_count
|
||||
field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn()
|
||||
returns the page to "LRU". Races prohibit mlock_count from being set to 1 then:
|
||||
rather than risk stranding a page indefinitely as unevictable, always err with
|
||||
mlock_count on the low side, so that when munlocked the page will be rescued to
|
||||
an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a
|
||||
VM_LOCKED VMA.
|
||||
|
||||
|
||||
Filtering Special VMAs
|
||||
|
@ -339,68 +343,48 @@ mlock_fixup() filters several classes of "special" VMAs:
|
|||
so there is no sense in attempting to visit them.
|
||||
|
||||
2) VMAs mapping hugetlbfs page are already effectively pinned into memory. We
|
||||
neither need nor want to mlock() these pages. However, to preserve the
|
||||
prior behavior of mlock() - before the unevictable/mlock changes -
|
||||
mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to
|
||||
allocate the huge pages and populate the ptes.
|
||||
neither need nor want to mlock() these pages. But __mm_populate() includes
|
||||
hugetlbfs ranges, allocating the huge pages and populating the PTEs.
|
||||
|
||||
3) VMAs with VM_DONTEXPAND are generally userspace mappings of kernel pages,
|
||||
such as the VDSO page, relay channel pages, etc. These pages
|
||||
are inherently unevictable and are not managed on the LRU lists.
|
||||
mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls
|
||||
make_pages_present() to populate the ptes.
|
||||
such as the VDSO page, relay channel pages, etc. These pages are inherently
|
||||
unevictable and are not managed on the LRU lists. __mm_populate() includes
|
||||
these ranges, populating the PTEs if not already populated.
|
||||
|
||||
4) VMAs with VM_MIXEDMAP set are not marked VM_LOCKED, but __mm_populate()
|
||||
includes these ranges, populating the PTEs if not already populated.
|
||||
|
||||
Note that for all of these special VMAs, mlock_fixup() does not set the
|
||||
VM_LOCKED flag. Therefore, we won't have to deal with them later during
|
||||
munlock(), munmap() or task exit. Neither does mlock_fixup() account these
|
||||
VMAs against the task's "locked_vm".
|
||||
|
||||
.. _munlock_munlockall_handling:
|
||||
|
||||
munlock()/munlockall() System Call Handling
|
||||
-------------------------------------------
|
||||
|
||||
The munlock() and munlockall() system calls are handled by the same functions -
|
||||
do_mlock[all]() - as the mlock() and mlockall() system calls with the unlock vs
|
||||
lock operation indicated by an argument. So, these system calls are also
|
||||
handled by mlock_fixup(). Again, if called for an already munlocked VMA,
|
||||
mlock_fixup() simply returns. Because of the VMA filtering discussed above,
|
||||
VM_LOCKED will not be set in any "special" VMAs. So, these VMAs will be
|
||||
ignored for munlock.
|
||||
The munlock() and munlockall() system calls are handled by the same
|
||||
mlock_fixup() function as mlock(), mlock2() and mlockall() system calls are.
|
||||
If called to munlock an already munlocked VMA, mlock_fixup() simply returns.
|
||||
Because of the VMA filtering discussed above, VM_LOCKED will not be set in
|
||||
any "special" VMAs. So, those VMAs will be ignored for munlock.
|
||||
|
||||
If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
|
||||
specified range. The range is then munlocked via the function
|
||||
populate_vma_page_range() - the same function used to mlock a VMA range -
|
||||
passing a flag to indicate that munlock() is being performed.
|
||||
specified range. All pages in the VMA are then munlocked by munlock_page() via
|
||||
mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same
|
||||
function used when mlocking a VMA range, with new flags for the VMA indicating
|
||||
that it is munlock() being performed.
|
||||
|
||||
Because the VMA access protections could have been changed to PROT_NONE after
|
||||
faulting in and mlocking pages, get_user_pages() was unreliable for visiting
|
||||
these pages for munlocking. Because we don't want to leave pages mlocked,
|
||||
get_user_pages() was enhanced to accept a flag to ignore the permissions when
|
||||
fetching the pages - all of which should be resident as a result of previous
|
||||
mlocking.
|
||||
munlock_page() uses the mlock pagevec to batch up work to be done under
|
||||
lru_lock by __munlock_page(). __munlock_page() decrements the page's
|
||||
mlock_count, and when that reaches 0 it clears PageMlocked and clears
|
||||
PageUnevictable, moving the page from unevictable state to inactive LRU.
|
||||
|
||||
For munlock(), populate_vma_page_range() unlocks individual pages by calling
|
||||
munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked
|
||||
flag using TestClearPageMlocked(). As with mlock_vma_page(),
|
||||
munlock_vma_page() use the Test*PageMlocked() function to handle the case where
|
||||
the page might have already been unlocked by another task. If the page was
|
||||
mlocked, munlock_vma_page() updates that zone statistics for the number of
|
||||
mlocked pages. Note, however, that at this point we haven't checked whether
|
||||
the page is mapped by other VM_LOCKED VMAs.
|
||||
|
||||
We can't call page_mlock(), the function that walks the reverse map to
|
||||
check for other VM_LOCKED VMAs, without first isolating the page from the LRU.
|
||||
page_mlock() is a variant of try_to_unmap() and thus requires that the page
|
||||
not be on an LRU list [more on these below]. However, the call to
|
||||
isolate_lru_page() could fail, in which case we can't call page_mlock(). So,
|
||||
we go ahead and clear PG_mlocked up front, as this might be the only chance we
|
||||
have. If we can successfully isolate the page, we go ahead and call
|
||||
page_mlock(), which will restore the PG_mlocked flag and update the zone
|
||||
page statistics if it finds another VMA holding the page mlocked. If we fail
|
||||
to isolate the page, we'll have left a potentially mlocked page on the LRU.
|
||||
This is fine, because we'll catch it later if and if vmscan tries to reclaim
|
||||
the page. This should be relatively rare.
|
||||
But in practice that may not work ideally: the page may not yet have reached
|
||||
"the unevictable LRU", or it may have been temporarily isolated from it. In
|
||||
those cases its mlock_count field is unusable and must be assumed to be 0: so
|
||||
that the page will be rescued to an evictable LRU, then perhaps be mlocked
|
||||
again later if vmscan finds it in a VM_LOCKED VMA.
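As an added illustration of why the munlocker must know whether other VM_LOCKED VMAs still cover the page (via mlock_count now, via the reverse map walk before): in this userspace sketch (assuming glibc 2.27+ for memfd_create()), the pages stay mlocked after the first munlock() because a second locked mapping still maps them::

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        size_t len = 64 << 10;
        int fd = memfd_create("mlock-demo", 0);
        if (fd < 0 || ftruncate(fd, len) != 0) { perror("memfd"); return 1; }

        char *a = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        char *b = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (a == MAP_FAILED || b == MAP_FAILED) { perror("mmap"); return 1; }
        memset(a, 0, len);

        mlock(a, len);              /* page mlock_count: 1 */
        mlock(b, len);              /* page mlock_count: 2 */

        /* Unlocking one VMA must not make the pages evictable: the
         * other VM_LOCKED VMA still maps them. */
        munlock(a, len);

        munlock(b, len);            /* now the pages can be rescued */
        munmap(a, len); munmap(b, len); close(fd);
        return 0;
    }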
|
||||
|
||||
|
||||
Migrating MLOCKED Pages
|
||||
|
@ -410,33 +394,38 @@ A page that is being migrated has been isolated from the LRU lists and is held
|
|||
locked across unmapping of the page, updating the page's address space entry
|
||||
and copying the contents and state, until the page table entry has been
|
||||
replaced with an entry that refers to the new page. Linux supports migration
|
||||
of mlocked pages and other unevictable pages. This involves simply moving the
|
||||
PG_mlocked and PG_unevictable states from the old page to the new page.
|
||||
of mlocked pages and other unevictable pages. PG_mlocked is cleared from the
|
||||
old page when it is unmapped from the last VM_LOCKED VMA, and set when the

|
||||
new page is mapped in place of migration entry in a VM_LOCKED VMA. If the page
|
||||
was unevictable because mlocked, PG_unevictable follows PG_mlocked; but if the
|
||||
page was unevictable for other reasons, PG_unevictable is copied explicitly.
|
||||
|
||||
Note that page migration can race with mlocking or munlocking of the same page.
|
||||
This has been discussed from the mlock/munlock perspective in the respective
|
||||
sections above. Both processes (migration and m[un]locking) hold the page
|
||||
locked. This provides the first level of synchronization. Page migration
|
||||
zeros out the page_mapping of the old page before unlocking it, so m[un]lock
|
||||
can skip these pages by testing the page mapping under page lock.
|
||||
There is mostly no problem since page migration requires unmapping all PTEs of
|
||||
the old page (including munlock where VM_LOCKED), then mapping in the new page
|
||||
(including mlock where VM_LOCKED). The page table locks provide sufficient
|
||||
synchronization.
|
||||
|
||||
To complete page migration, we place the new and old pages back onto the LRU
|
||||
after dropping the page lock. The "unneeded" page - old page on success, new
|
||||
page on failure - will be freed when the reference count held by the migration
|
||||
process is released. To ensure that we don't strand pages on the unevictable
|
||||
list because of a race between munlock and migration, page migration uses the
|
||||
putback_lru_page() function to add migrated pages back to the LRU.
|
||||
However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA,
|
||||
before mlocking any pages already present, if one of those pages were migrated
|
||||
before mlock_pte_range() reached it, it would get counted twice in mlock_count.
|
||||
To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO,
|
||||
so that mlock_vma_page() will skip it.
|
||||
|
||||
To complete page migration, we place the old and new pages back onto the LRU
|
||||
afterwards. The "unneeded" page - old page on success, new page on failure -
|
||||
is freed when the reference count held by the migration process is released.
|
||||
|
||||
|
||||
Compacting MLOCKED Pages
|
||||
------------------------
|
||||
|
||||
The unevictable LRU can be scanned for compactable regions and the default
|
||||
behavior is to do so. /proc/sys/vm/compact_unevictable_allowed controls
|
||||
this behavior (see Documentation/admin-guide/sysctl/vm.rst). Once scanning of the
|
||||
unevictable LRU is enabled, the work of compaction is mostly handled by
|
||||
the page migration code and the same work flow as described in MIGRATING
|
||||
MLOCKED PAGES will apply.
|
||||
The memory map can be scanned for compactable regions and the default behavior
|
||||
is to let unevictable pages be moved. /proc/sys/vm/compact_unevictable_allowed
|
||||
controls this behavior (see Documentation/admin-guide/sysctl/vm.rst). The work
|
||||
of compaction is mostly handled by the page migration code and the same work
|
||||
flow as described in Migrating MLOCKED Pages will apply.
|
||||
|
||||
|
||||
MLOCKING Transparent Huge Pages
|
||||
-------------------------------
|
||||
|
@ -445,51 +434,44 @@ A transparent huge page is represented by a single entry on an LRU list.
|
|||
Therefore, we can only make unevictable an entire compound page, not
|
||||
individual subpages.
|
||||
|
||||
If a user tries to mlock() part of a huge page, we want the rest of the
|
||||
page to be reclaimable.
|
||||
If a user tries to mlock() part of a huge page, and no user mlock()s the
|
||||
whole of the huge page, we want the rest of the page to be reclaimable.
|
||||
|
||||
We cannot just split the page on partial mlock() as split_huge_page() can
|
||||
fail and new intermittent failure mode for the syscall is undesirable.
|
||||
fail and a new intermittent failure mode for the syscall is undesirable.
|
||||
|
||||
We handle this by keeping PTE-mapped huge pages on normal LRU lists: the
|
||||
PMD on border of VM_LOCKED VMA will be split into PTE table.
|
||||
We handle this by keeping PTE-mlocked huge pages on evictable LRU lists:
|
||||
the PMD on the border of a VM_LOCKED VMA will be split into a PTE table.
|
||||
|
||||
This way the huge page is accessible for vmscan. Under memory pressure the
|
||||
This way the huge page is accessible for vmscan. Under memory pressure the
|
||||
page will be split, subpages which belong to VM_LOCKED VMAs will be moved
|
||||
to unevictable LRU and the rest can be reclaimed.
|
||||
to the unevictable LRU and the rest can be reclaimed.
|
||||
|
||||
/proc/meminfo's Unevictable and Mlocked amounts do not include those parts
|
||||
of a transparent huge page which are mapped only by PTEs in VM_LOCKED VMAs.
|
||||
|
||||
See also comment in follow_trans_huge_pmd().
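A hedged userspace illustration of the partial-mlock case (not from the patch; MADV_HUGEPAGE is only a hint and THP may not be used at all on a given system)::

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 4UL << 20;             /* two PMD-sized extents */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        madvise(p, len, MADV_HUGEPAGE);     /* ask for THP backing */
        memset(p, 0, len);                  /* fault in, possibly as huge pages */

        /* mlock only part of the range: the PMD on the VM_LOCKED border
         * is split into a PTE table, and only the locked part is held
         * resident; the rest of the huge page stays reclaimable. */
        if (mlock(p, 64 << 10) != 0)
            perror("mlock");
        return 0;
    }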
|
||||
|
||||
mmap(MAP_LOCKED) System Call Handling
|
||||
-------------------------------------
|
||||
|
||||
In addition the mlock()/mlockall() system calls, an application can request
|
||||
that a region of memory be mlocked supplying the MAP_LOCKED flag to the mmap()
|
||||
call. There is one important and subtle difference here, though. mmap() + mlock()
|
||||
will fail if the range cannot be faulted in (e.g. because mm_populate fails)
|
||||
and returns with ENOMEM while mmap(MAP_LOCKED) will not fail. The mmaped
|
||||
area will still have properties of the locked area - aka. pages will not get
|
||||
swapped out - but major page faults to fault memory in might still happen.
|
||||
In addition to the mlock(), mlock2() and mlockall() system calls, an application
|
||||
can request that a region of memory be mlocked by supplying the MAP_LOCKED flag
|
||||
to the mmap() call. There is one important and subtle difference here, though.
|
||||
mmap() + mlock() will fail if the range cannot be faulted in (e.g. because
|
||||
mm_populate fails) and returns with ENOMEM while mmap(MAP_LOCKED) will not fail.
|
||||
The mmaped area will still have properties of the locked area - pages will not
|
||||
get swapped out - but major page faults to fault memory in might still happen.
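To make that difference concrete, a small userspace comparison (an illustration, not part of the patch; it assumes RLIMIT_MEMLOCK covers about 1 MiB, so both calls normally succeed and the comments describe where a failure would surface)::

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 1 << 20;

        /* MAP_LOCKED: the mapping gets the locked property, but a failure
         * to populate it is not reported; later major faults are possible. */
        void *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
        if (a == MAP_FAILED)
            perror("mmap(MAP_LOCKED)");

        /* mmap() + mlock(): if the range cannot be locked and faulted in,
         * the error (e.g. ENOMEM) is reported here. */
        void *b = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (b == MAP_FAILED || mlock(b, len) != 0)
            perror("mmap + mlock");
        return 0;
    }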
|
||||
|
||||
Furthermore, any mmap() call or brk() call that expands the heap by a
|
||||
task that has previously called mlockall() with the MCL_FUTURE flag will result
|
||||
Furthermore, any mmap() call or brk() call that expands the heap by a task
|
||||
that has previously called mlockall() with the MCL_FUTURE flag will result
|
||||
in the newly mapped memory being mlocked. Before the unevictable/mlock
|
||||
changes, the kernel simply called make_pages_present() to allocate pages and
|
||||
populate the page table.
|
||||
changes, the kernel simply called make_pages_present() to allocate pages
|
||||
and populate the page table.
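A short userspace sketch of the MCL_FUTURE behaviour (added for illustration; whether the calls succeed depends on RLIMIT_MEMLOCK or CAP_IPC_LOCK)::

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        /* Every future mmap()/brk() expansion in this task will be
         * mlocked automatically, with no separate mlock() call. */
        if (mlockall(MCL_FUTURE) != 0) { perror("mlockall"); return 1; }

        size_t len = 64 << 10;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }
        memset(p, 0, len);          /* already VM_LOCKED when faulted in */

        munlockall();
        return 0;
    }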
|
||||
|
||||
To mlock a range of memory under the unevictable/mlock infrastructure, the
|
||||
mmap() handler and task address space expansion functions call
|
||||
To mlock a range of memory under the unevictable/mlock infrastructure,
|
||||
the mmap() handler and task address space expansion functions call
|
||||
populate_vma_page_range() specifying the vma and the address range to mlock.
|
||||
|
||||
The callers of populate_vma_page_range() will have already added the memory range
|
||||
to be mlocked to the task's "locked_vm". To account for filtered VMAs,
|
||||
populate_vma_page_range() returns the number of pages NOT mlocked. All of the
|
||||
callers then subtract a non-negative return value from the task's locked_vm. A
|
||||
negative return value represent an error - for example, from get_user_pages()
|
||||
attempting to fault in a VMA with PROT_NONE access. In this case, we leave the
|
||||
memory range accounted as locked_vm, as the protections could be changed later
|
||||
and pages allocated into that region.
|
||||
|
||||
|
||||
munmap()/exit()/exec() System Call Handling
|
||||
-------------------------------------------
|
||||
|
@ -500,81 +482,53 @@ munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages.
|
|||
Before the unevictable/mlock changes, mlocking did not mark the pages in any
|
||||
way, so unmapping them required no processing.
|
||||
|
||||
To munlock a range of memory under the unevictable/mlock infrastructure, the
|
||||
munmap() handler and task address space call tear down function
|
||||
munlock_vma_pages_all(). The name reflects the observation that one always
|
||||
specifies the entire VMA range when munlock()ing during unmap of a region.
|
||||
Because of the VMA filtering when mlocking() regions, only "normal" VMAs that
|
||||
actually contain mlocked pages will be passed to munlock_vma_pages_all().
|
||||
For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
|
||||
munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
|
||||
(unless it was a PTE mapping of a part of a transparent huge page).
|
||||
|
||||
munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like mlock_fixup()
|
||||
for the munlock case, calls __munlock_vma_pages_range() to walk the page table
|
||||
for the VMA's memory range and munlock_vma_page() each resident page mapped by
|
||||
the VMA. This effectively munlocks the page, only if this is the last
|
||||
VM_LOCKED VMA that maps the page.
|
||||
munlock_page() uses the mlock pagevec to batch up work to be done under
|
||||
lru_lock by __munlock_page(). __munlock_page() decrements the page's
|
||||
mlock_count, and when that reaches 0 it clears PageMlocked and clears
|
||||
PageUnevictable, moving the page from unevictable state to inactive LRU.
|
||||
|
||||
But in practice that may not work ideally: the page may not yet have reached
|
||||
"the unevictable LRU", or it may have been temporarily isolated from it. In
|
||||
those cases its mlock_count field is unusable and must be assumed to be 0: so
|
||||
that the page will be rescued to an evictable LRU, then perhaps be mlocked
|
||||
again later if vmscan finds it in a VM_LOCKED VMA.
|
||||
|
||||
|
||||
try_to_unmap()
|
||||
--------------
|
||||
Truncating MLOCKED Pages
|
||||
------------------------
|
||||
|
||||
Pages can, of course, be mapped into multiple VMAs. Some of these VMAs may
|
||||
have VM_LOCKED flag set. It is possible for a page mapped into one or more
|
||||
VM_LOCKED VMAs not to have the PG_mlocked flag set and therefore reside on one
|
||||
of the active or inactive LRU lists. This could happen if, for example, a task
|
||||
in the process of munlocking the page could not isolate the page from the LRU.
|
||||
As a result, vmscan/shrink_page_list() might encounter such a page as described
|
||||
in section "vmscan's handling of unevictable pages". To handle this situation,
|
||||
try_to_unmap() checks for VM_LOCKED VMAs while it is walking a page's reverse
|
||||
map.
|
||||
File truncation or hole punching forcibly unmaps the deleted pages from
|
||||
userspace; truncation even unmaps and deletes any private anonymous pages
|
||||
which had been Copied-On-Write from the file pages now being truncated.
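As an added userspace illustration of hole punching under an mlocked mapping (not part of the patch; it assumes glibc exposes memfd_create() and the FALLOC_FL_* flags, and that RLIMIT_MEMLOCK covers 128 KiB)::

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        size_t len = 128 << 10;
        int fd = memfd_create("punch-demo", 0);
        if (fd < 0 || ftruncate(fd, len) != 0) { perror("memfd"); return 1; }

        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }
        memset(p, 0, len);
        if (mlock(p, len) != 0)
            perror("mlock");

        /* The punched pages are forcibly unmapped (and so munlocked),
         * even though the VMA itself remains VM_LOCKED. */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      0, len / 2) != 0)
            perror("fallocate");

        munmap(p, len);
        close(fd);
        return 0;
    }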
|
||||
|
||||
try_to_unmap() is always called, by either vmscan for reclaim or for page
|
||||
migration, with the argument page locked and isolated from the LRU. Separate
|
||||
functions handle anonymous and mapped file and KSM pages, as these types of
|
||||
pages have different reverse map lookup mechanisms, with different locking.
|
||||
In each case, whether rmap_walk_anon() or rmap_walk_file() or rmap_walk_ksm(),
|
||||
it will call try_to_unmap_one() for every VMA which might contain the page.
|
||||
Mlocked pages can be munlocked and deleted in this way: like with munmap(),
|
||||
for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls
|
||||
munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED
|
||||
(unless it was a PTE mapping of a part of a transparent huge page).
|
||||
|
||||
When trying to reclaim, if try_to_unmap_one() finds the page in a VM_LOCKED
|
||||
VMA, it will then mlock the page via mlock_vma_page() instead of unmapping it,
|
||||
and return SWAP_MLOCK to indicate that the page is unevictable: and the scan
|
||||
stops there.
|
||||
|
||||
mlock_vma_page() is called while holding the page table's lock (in addition
|
||||
to the page lock, and the rmap lock): to serialize against concurrent mlock or
|
||||
munlock or munmap system calls, mm teardown (munlock_vma_pages_all), reclaim,
|
||||
holepunching, and truncation of file pages and their anonymous COWed pages.
|
||||
|
||||
|
||||
page_mlock() Reverse Map Scan
|
||||
---------------------------------
|
||||
|
||||
When munlock_vma_page() [see section :ref:`munlock()/munlockall() System Call
|
||||
Handling <munlock_munlockall_handling>` above] tries to munlock a
|
||||
page, it needs to determine whether or not the page is mapped by any
|
||||
VM_LOCKED VMA without actually attempting to unmap all PTEs from the
|
||||
page. For this purpose, the unevictable/mlock infrastructure
|
||||
introduced a variant of try_to_unmap() called page_mlock().
|
||||
|
||||
page_mlock() walks the respective reverse maps looking for VM_LOCKED VMAs. When
|
||||
such a VMA is found the page is mlocked via mlock_vma_page(). This undoes the
|
||||
pre-clearing of the page's PG_mlocked done by munlock_vma_page.
|
||||
|
||||
Note that page_mlock()'s reverse map walk must visit every VMA in a page's
|
||||
reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
|
||||
However, the scan can terminate when it encounters a VM_LOCKED VMA.
|
||||
Although page_mlock() might be called a great many times when munlocking a
|
||||
large region or tearing down a large address space that has been mlocked via
|
||||
mlockall(), overall this is a fairly rare event.
|
||||
However, if there is a racing munlock(), since mlock_vma_pages_range() starts
|
||||
munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages
|
||||
present, if one of those pages were unmapped by truncation or hole punch before
|
||||
mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA,
|
||||
and would not be counted out of mlock_count. In this rare case, a page may
|
||||
still appear as PageMlocked after it has been fully unmapped: and it is left to
|
||||
release_pages() (or __page_cache_release()) to clear it and update statistics
|
||||
before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared,
|
||||
which is usually 0).
|
||||
|
||||
|
||||
Page Reclaim in shrink_*_list()
|
||||
-------------------------------
|
||||
|
||||
shrink_active_list() culls any obviously unevictable pages - i.e.
|
||||
!page_evictable(page) - diverting these to the unevictable list.
|
||||
vmscan's shrink_active_list() culls any obviously unevictable pages -
|
||||
i.e. !page_evictable(page) pages - diverting those to the unevictable list.
|
||||
However, shrink_active_list() only sees unevictable pages that made it onto the
|
||||
active/inactive lru lists. Note that these pages do not have PageUnevictable
|
||||
set - otherwise they would be on the unevictable list and shrink_active_list
|
||||
active/inactive LRU lists. Note that these pages do not have PageUnevictable
|
||||
set - otherwise they would be on the unevictable list and shrink_active_list()
|
||||
would never see them.
|
||||
|
||||
Some examples of these unevictable pages on the LRU lists are:
|
||||
|
@ -586,20 +540,15 @@ Some examples of these unevictable pages on the LRU lists are:
|
|||
when an application accesses the page the first time after SHM_LOCK'ing
|
||||
the segment.
|
||||
|
||||
(3) mlocked pages that could not be isolated from the LRU and moved to the
|
||||
unevictable list in mlock_vma_page().
|
||||
(3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked,
|
||||
but events left mlock_count too low, so they were munlocked too early.
|
||||
|
||||
shrink_inactive_list() also diverts any unevictable pages that it finds on the
|
||||
inactive lists to the appropriate node's unevictable list.
|
||||
vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously
|
||||
unevictable pages found on the inactive lists to the appropriate memory cgroup
|
||||
and node unevictable list.
|
||||
|
||||
shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
|
||||
after shrink_active_list() had moved them to the inactive list, or pages mapped
|
||||
into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to
|
||||
recheck via page_mlock(). shrink_inactive_list() won't notice the latter,
|
||||
but will pass on to shrink_page_list().
|
||||
|
||||
shrink_page_list() again culls obviously unevictable pages that it could
|
||||
encounter for similar reason to shrink_inactive_list(). Pages mapped into
|
||||
VM_LOCKED VMAs but without PG_mlocked set will make it all the way to
|
||||
try_to_unmap(). shrink_page_list() will divert them to the unevictable list
|
||||
when try_to_unmap() returns SWAP_MLOCK, as discussed above.
|
||||
rmap's page_referenced_one(), called via vmscan's shrink_active_list() or
|
||||
shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(),
|
||||
check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page()
|
||||
to correct them. Such pages are culled to the unevictable list when released
|
||||
by the shrinker.
|
||||
|
|
|
@ -20,6 +20,23 @@
|
|||
#include "page.h"
|
||||
#include "btnode.h"
|
||||
|
||||
|
||||
/**
|
||||
* nilfs_init_btnc_inode - initialize B-tree node cache inode
|
||||
* @btnc_inode: inode to be initialized
|
||||
*
|
||||
* nilfs_init_btnc_inode() sets up an inode for B-tree node cache.
|
||||
*/
|
||||
void nilfs_init_btnc_inode(struct inode *btnc_inode)
|
||||
{
|
||||
struct nilfs_inode_info *ii = NILFS_I(btnc_inode);
|
||||
|
||||
btnc_inode->i_mode = S_IFREG;
|
||||
ii->i_flags = 0;
|
||||
memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap));
|
||||
mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS);
|
||||
}
|
||||
|
||||
void nilfs_btnode_cache_clear(struct address_space *btnc)
|
||||
{
|
||||
invalidate_mapping_pages(btnc, 0, -1);
|
||||
|
@ -29,7 +46,7 @@ void nilfs_btnode_cache_clear(struct address_space *btnc)
|
|||
struct buffer_head *
|
||||
nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
|
||||
{
|
||||
struct inode *inode = NILFS_BTNC_I(btnc);
|
||||
struct inode *inode = btnc->host;
|
||||
struct buffer_head *bh;
|
||||
|
||||
bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
|
||||
|
@ -57,7 +74,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
|
|||
struct buffer_head **pbh, sector_t *submit_ptr)
|
||||
{
|
||||
struct buffer_head *bh;
|
||||
struct inode *inode = NILFS_BTNC_I(btnc);
|
||||
struct inode *inode = btnc->host;
|
||||
struct page *page;
|
||||
int err;
|
||||
|
||||
|
@ -157,7 +174,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
|
|||
struct nilfs_btnode_chkey_ctxt *ctxt)
|
||||
{
|
||||
struct buffer_head *obh, *nbh;
|
||||
struct inode *inode = NILFS_BTNC_I(btnc);
|
||||
struct inode *inode = btnc->host;
|
||||
__u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
|
||||
int err;
|
||||
|
||||
|
|
|
@ -30,6 +30,7 @@ struct nilfs_btnode_chkey_ctxt {
|
|||
struct buffer_head *newbh;
|
||||
};
|
||||
|
||||
void nilfs_init_btnc_inode(struct inode *btnc_inode);
|
||||
void nilfs_btnode_cache_clear(struct address_space *);
|
||||
struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
|
||||
__u64 blocknr);
|
||||
|
|
|
@ -58,7 +58,8 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path)
|
|||
static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
|
||||
__u64 ptr, struct buffer_head **bhp)
|
||||
{
|
||||
struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
|
||||
struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
|
||||
struct address_space *btnc = btnc_inode->i_mapping;
|
||||
struct buffer_head *bh;
|
||||
|
||||
bh = nilfs_btnode_create_block(btnc, ptr);
|
||||
|
@ -470,7 +471,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
|
|||
struct buffer_head **bhp,
|
||||
const struct nilfs_btree_readahead_info *ra)
|
||||
{
|
||||
struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
|
||||
struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
|
||||
struct address_space *btnc = btnc_inode->i_mapping;
|
||||
struct buffer_head *bh, *ra_bh;
|
||||
sector_t submit_ptr = 0;
|
||||
int ret;
|
||||
|
@ -1741,6 +1743,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
|
|||
dat = nilfs_bmap_get_dat(btree);
|
||||
}
|
||||
|
||||
ret = nilfs_attach_btree_node_cache(&NILFS_BMAP_I(btree)->vfs_inode);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
@ -1913,7 +1919,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
|
|||
path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
|
||||
path[level].bp_ctxt.bh = path[level].bp_bh;
|
||||
ret = nilfs_btnode_prepare_change_key(
|
||||
&NILFS_BMAP_I(btree)->i_btnode_cache,
|
||||
NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
|
||||
&path[level].bp_ctxt);
|
||||
if (ret < 0) {
|
||||
nilfs_dat_abort_update(dat,
|
||||
|
@ -1939,7 +1945,7 @@ static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
|
|||
|
||||
if (buffer_nilfs_node(path[level].bp_bh)) {
|
||||
nilfs_btnode_commit_change_key(
|
||||
&NILFS_BMAP_I(btree)->i_btnode_cache,
|
||||
NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
|
||||
&path[level].bp_ctxt);
|
||||
path[level].bp_bh = path[level].bp_ctxt.bh;
|
||||
}
|
||||
|
@ -1958,7 +1964,7 @@ static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
|
|||
&path[level].bp_newreq.bpr_req);
|
||||
if (buffer_nilfs_node(path[level].bp_bh))
|
||||
nilfs_btnode_abort_change_key(
|
||||
&NILFS_BMAP_I(btree)->i_btnode_cache,
|
||||
NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
|
||||
&path[level].bp_ctxt);
|
||||
}
|
||||
|
||||
|
@ -2134,7 +2140,8 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
|
|||
static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
|
||||
struct list_head *listp)
|
||||
{
|
||||
struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
|
||||
struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
|
||||
struct address_space *btcache = btnc_inode->i_mapping;
|
||||
struct list_head lists[NILFS_BTREE_LEVEL_MAX];
|
||||
struct pagevec pvec;
|
||||
struct buffer_head *bh, *head;
|
||||
|
@ -2188,12 +2195,12 @@ static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
|
|||
path[level].bp_ctxt.newkey = blocknr;
|
||||
path[level].bp_ctxt.bh = *bh;
|
||||
ret = nilfs_btnode_prepare_change_key(
|
||||
&NILFS_BMAP_I(btree)->i_btnode_cache,
|
||||
NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
|
||||
&path[level].bp_ctxt);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
nilfs_btnode_commit_change_key(
|
||||
&NILFS_BMAP_I(btree)->i_btnode_cache,
|
||||
NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
|
||||
&path[level].bp_ctxt);
|
||||
*bh = path[level].bp_ctxt.bh;
|
||||
}
|
||||
|
@ -2398,6 +2405,10 @@ int nilfs_btree_init(struct nilfs_bmap *bmap)
|
|||
|
||||
if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode))
|
||||
ret = -EIO;
|
||||
else
|
||||
ret = nilfs_attach_btree_node_cache(
|
||||
&NILFS_BMAP_I(bmap)->vfs_inode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
@ -497,7 +497,9 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
|
|||
di = NILFS_DAT_I(dat);
|
||||
lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
|
||||
nilfs_palloc_setup_cache(dat, &di->palloc_cache);
|
||||
nilfs_mdt_setup_shadow_map(dat, &di->shadow);
|
||||
err = nilfs_mdt_setup_shadow_map(dat, &di->shadow);
|
||||
if (err)
|
||||
goto failed;
|
||||
|
||||
err = nilfs_read_inode_common(dat, raw_inode);
|
||||
if (err)
|
||||
|
|
|
@ -126,9 +126,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
|
|||
int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
|
||||
__u64 vbn, struct buffer_head **out_bh)
|
||||
{
|
||||
struct inode *btnc_inode = NILFS_I(inode)->i_assoc_inode;
|
||||
int ret;
|
||||
|
||||
ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
|
||||
ret = nilfs_btnode_submit_block(btnc_inode->i_mapping,
|
||||
vbn ? : pbn, pbn, REQ_OP_READ, 0,
|
||||
out_bh, &pbn);
|
||||
if (ret == -EEXIST) /* internal code (cache hit) */
|
||||
|
@ -170,7 +171,7 @@ int nilfs_init_gcinode(struct inode *inode)
|
|||
ii->i_flags = 0;
|
||||
nilfs_bmap_init_gc(ii->i_bmap);
|
||||
|
||||
return 0;
|
||||
return nilfs_attach_btree_node_cache(inode);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -185,7 +186,7 @@ void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs)
|
|||
ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
|
||||
list_del_init(&ii->i_dirty);
|
||||
truncate_inode_pages(&ii->vfs_inode.i_data, 0);
|
||||
nilfs_btnode_cache_clear(&ii->i_btnode_cache);
|
||||
nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping);
|
||||
iput(&ii->vfs_inode);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,12 +29,16 @@
|
|||
* @cno: checkpoint number
|
||||
* @root: pointer on NILFS root object (mounted checkpoint)
|
||||
* @for_gc: inode for GC flag
|
||||
* @for_btnc: inode for B-tree node cache flag
|
||||
* @for_shadow: inode for shadowed page cache flag
|
||||
*/
|
||||
struct nilfs_iget_args {
|
||||
u64 ino;
|
||||
__u64 cno;
|
||||
struct nilfs_root *root;
|
||||
int for_gc;
|
||||
bool for_gc;
|
||||
bool for_btnc;
|
||||
bool for_shadow;
|
||||
};
|
||||
|
||||
static int nilfs_iget_test(struct inode *inode, void *opaque);
|
||||
|
@ -312,7 +316,8 @@ static int nilfs_insert_inode_locked(struct inode *inode,
|
|||
unsigned long ino)
|
||||
{
|
||||
struct nilfs_iget_args args = {
|
||||
.ino = ino, .root = root, .cno = 0, .for_gc = 0
|
||||
.ino = ino, .root = root, .cno = 0, .for_gc = false,
|
||||
.for_btnc = false, .for_shadow = false
|
||||
};
|
||||
|
||||
return insert_inode_locked4(inode, ino, nilfs_iget_test, &args);
|
||||
|
@ -525,6 +530,19 @@ static int nilfs_iget_test(struct inode *inode, void *opaque)
|
|||
return 0;
|
||||
|
||||
ii = NILFS_I(inode);
|
||||
if (test_bit(NILFS_I_BTNC, &ii->i_state)) {
|
||||
if (!args->for_btnc)
|
||||
return 0;
|
||||
} else if (args->for_btnc) {
|
||||
return 0;
|
||||
}
|
||||
if (test_bit(NILFS_I_SHADOW, &ii->i_state)) {
|
||||
if (!args->for_shadow)
|
||||
return 0;
|
||||
} else if (args->for_shadow) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
|
||||
return !args->for_gc;
|
||||
|
||||
|
@ -536,15 +554,17 @@ static int nilfs_iget_set(struct inode *inode, void *opaque)
|
|||
struct nilfs_iget_args *args = opaque;
|
||||
|
||||
inode->i_ino = args->ino;
|
||||
if (args->for_gc) {
|
||||
NILFS_I(inode)->i_cno = args->cno;
|
||||
NILFS_I(inode)->i_root = args->root;
|
||||
if (args->root && args->ino == NILFS_ROOT_INO)
|
||||
nilfs_get_root(args->root);
|
||||
|
||||
if (args->for_gc)
|
||||
NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE);
|
||||
NILFS_I(inode)->i_cno = args->cno;
|
||||
NILFS_I(inode)->i_root = NULL;
|
||||
} else {
|
||||
if (args->root && args->ino == NILFS_ROOT_INO)
|
||||
nilfs_get_root(args->root);
|
||||
NILFS_I(inode)->i_root = args->root;
|
||||
}
|
||||
if (args->for_btnc)
|
||||
NILFS_I(inode)->i_state |= BIT(NILFS_I_BTNC);
|
||||
if (args->for_shadow)
|
||||
NILFS_I(inode)->i_state |= BIT(NILFS_I_SHADOW);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -552,7 +572,8 @@ struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
 				    unsigned long ino)
 {
 	struct nilfs_iget_args args = {
-		.ino = ino, .root = root, .cno = 0, .for_gc = 0
+		.ino = ino, .root = root, .cno = 0, .for_gc = false,
+		.for_btnc = false, .for_shadow = false
 	};
 
 	return ilookup5(sb, ino, nilfs_iget_test, &args);

@@ -562,7 +583,8 @@ struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
 				       unsigned long ino)
 {
 	struct nilfs_iget_args args = {
-		.ino = ino, .root = root, .cno = 0, .for_gc = 0
+		.ino = ino, .root = root, .cno = 0, .for_gc = false,
+		.for_btnc = false, .for_shadow = false
 	};
 
 	return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);

@@ -593,7 +615,8 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
 				__u64 cno)
 {
 	struct nilfs_iget_args args = {
-		.ino = ino, .root = NULL, .cno = cno, .for_gc = 1
+		.ino = ino, .root = NULL, .cno = cno, .for_gc = true,
+		.for_btnc = false, .for_shadow = false
 	};
 	struct inode *inode;
 	int err;

@@ -613,6 +636,113 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
 	return inode;
 }
 
+/**
+ * nilfs_attach_btree_node_cache - attach a B-tree node cache to the inode
+ * @inode: inode object
+ *
+ * nilfs_attach_btree_node_cache() attaches a B-tree node cache to @inode,
+ * or does nothing if the inode already has it.  This function allocates
+ * an additional inode to maintain page cache of B-tree nodes one-on-one.
+ *
+ * Return Value: On success, 0 is returned. On errors, one of the following
+ * negative error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_attach_btree_node_cache(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct inode *btnc_inode;
+	struct nilfs_iget_args args;
+
+	if (ii->i_assoc_inode)
+		return 0;
+
+	args.ino = inode->i_ino;
+	args.root = ii->i_root;
+	args.cno = ii->i_cno;
+	args.for_gc = test_bit(NILFS_I_GCINODE, &ii->i_state) != 0;
+	args.for_btnc = true;
+	args.for_shadow = test_bit(NILFS_I_SHADOW, &ii->i_state) != 0;
+
+	btnc_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test,
+				  nilfs_iget_set, &args);
+	if (unlikely(!btnc_inode))
+		return -ENOMEM;
+	if (btnc_inode->i_state & I_NEW) {
+		nilfs_init_btnc_inode(btnc_inode);
+		unlock_new_inode(btnc_inode);
+	}
+	NILFS_I(btnc_inode)->i_assoc_inode = inode;
+	NILFS_I(btnc_inode)->i_bmap = ii->i_bmap;
+	ii->i_assoc_inode = btnc_inode;
+
+	return 0;
+}
+
+/**
+ * nilfs_detach_btree_node_cache - detach the B-tree node cache from the inode
+ * @inode: inode object
+ *
+ * nilfs_detach_btree_node_cache() detaches the B-tree node cache and its
+ * holder inode bound to @inode, or does nothing if @inode doesn't have it.
+ */
+void nilfs_detach_btree_node_cache(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct inode *btnc_inode = ii->i_assoc_inode;
+
+	if (btnc_inode) {
+		NILFS_I(btnc_inode)->i_assoc_inode = NULL;
+		ii->i_assoc_inode = NULL;
+		iput(btnc_inode);
+	}
+}
+
+/**
+ * nilfs_iget_for_shadow - obtain inode for shadow mapping
+ * @inode: inode object that uses shadow mapping
+ *
+ * nilfs_iget_for_shadow() allocates a pair of inodes that holds page
+ * caches for shadow mapping.  The page cache for data pages is set up
+ * in one inode and the one for b-tree node pages is set up in the
+ * other inode, which is attached to the former inode.
+ *
+ * Return Value: On success, a pointer to the inode for data pages is
+ * returned. On errors, one of the following negative error code is returned
+ * in a pointer type.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+struct inode *nilfs_iget_for_shadow(struct inode *inode)
+{
+	struct nilfs_iget_args args = {
+		.ino = inode->i_ino, .root = NULL, .cno = 0, .for_gc = false,
+		.for_btnc = false, .for_shadow = true
+	};
+	struct inode *s_inode;
+	int err;
+
+	s_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test,
+			       nilfs_iget_set, &args);
+	if (unlikely(!s_inode))
+		return ERR_PTR(-ENOMEM);
+	if (!(s_inode->i_state & I_NEW))
+		return inode;
+
+	NILFS_I(s_inode)->i_flags = 0;
+	memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap));
+	mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS);
+
+	err = nilfs_attach_btree_node_cache(s_inode);
+	if (unlikely(err)) {
+		iget_failed(s_inode);
+		return ERR_PTR(err);
+	}
+	unlock_new_inode(s_inode);
+	return s_inode;
+}
+
 void nilfs_write_inode_common(struct inode *inode,
 			      struct nilfs_inode *raw_inode, int has_bmap)
 {

@@ -760,7 +890,8 @@ static void nilfs_clear_inode(struct inode *inode)
 	if (test_bit(NILFS_I_BMAP, &ii->i_state))
 		nilfs_bmap_clear(ii->i_bmap);
 
-	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+	if (!test_bit(NILFS_I_BTNC, &ii->i_state))
+		nilfs_detach_btree_node_cache(inode);
 
 	if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
 		nilfs_put_root(ii->i_root);
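The attach/detach pair added above follows a simple ownership pattern: the companion inode is allocated on first use, back-points to its owner through i_assoc_inode, and is released exactly once on detach. The following userspace sketch models only that pairing; the reference count here is a plain integer, whereas the kernel uses the inode refcount (iget5_locked()/iput()), and all names are invented for the demo.

#include <stdio.h>
#include <stdlib.h>

/* Illustrative owner/companion pair; not the kernel structures. */
struct cache_holder {
        struct owner *owner;        /* back pointer, like i_assoc_inode */
        int refcount;
};

struct owner {
        struct cache_holder *assoc; /* like i_assoc_inode on the data inode */
};

/* First call allocates the companion, later calls are no-ops. */
static int attach_cache_holder(struct owner *o)
{
        struct cache_holder *h;

        if (o->assoc)
                return 0;
        h = calloc(1, sizeof(*h));
        if (!h)
                return -1;
        h->owner = o;
        h->refcount = 1;
        o->assoc = h;
        return 0;
}

/* Break both links and drop the reference, mirroring the detach helper. */
static void detach_cache_holder(struct owner *o)
{
        struct cache_holder *h = o->assoc;

        if (!h)
                return;
        h->owner = NULL;
        o->assoc = NULL;
        if (--h->refcount == 0)
                free(h);
}

int main(void)
{
        struct owner o = { 0 };

        attach_cache_holder(&o);
        attach_cache_holder(&o);        /* second attach is a no-op */
        printf("attached: %p\n", (void *)o.assoc);
        detach_cache_holder(&o);
        printf("after detach: %p\n", (void *)o.assoc);
        return 0;
}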
fs/nilfs2/mdt.c

@@ -471,9 +471,18 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
 void nilfs_mdt_clear(struct inode *inode)
 {
 	struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+	struct nilfs_shadow_map *shadow = mdi->mi_shadow;
 
 	if (mdi->mi_palloc_cache)
 		nilfs_palloc_destroy_cache(inode);
+
+	if (shadow) {
+		struct inode *s_inode = shadow->inode;
+
+		shadow->inode = NULL;
+		iput(s_inode);
+		mdi->mi_shadow = NULL;
+	}
 }
 
 /**

@@ -507,12 +516,15 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
 			       struct nilfs_shadow_map *shadow)
 {
 	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	struct inode *s_inode;
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
-	address_space_init_once(&shadow->frozen_data);
-	nilfs_mapping_init(&shadow->frozen_data, inode);
-	address_space_init_once(&shadow->frozen_btnodes);
-	nilfs_mapping_init(&shadow->frozen_btnodes, inode);
+
+	s_inode = nilfs_iget_for_shadow(inode);
+	if (IS_ERR(s_inode))
+		return PTR_ERR(s_inode);
+
+	shadow->inode = s_inode;
 	mi->mi_shadow = shadow;
 	return 0;
 }

@@ -526,14 +538,15 @@ int nilfs_mdt_save_to_shadow_map(struct inode *inode)
 	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+	struct inode *s_inode = shadow->inode;
 	int ret;
 
-	ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
+	ret = nilfs_copy_dirty_pages(s_inode->i_mapping, inode->i_mapping);
 	if (ret)
 		goto out;
 
-	ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
-				     &ii->i_btnode_cache);
+	ret = nilfs_copy_dirty_pages(NILFS_I(s_inode)->i_assoc_inode->i_mapping,
+				     ii->i_assoc_inode->i_mapping);
 	if (ret)
 		goto out;
 

@@ -549,7 +562,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
 	struct page *page;
 	int blkbits = inode->i_blkbits;
 
-	page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
+	page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index);
 	if (!page)
 		return -ENOMEM;
 

@@ -581,7 +594,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
 	struct page *page;
 	int n;
 
-	page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
+	page = find_lock_page(shadow->inode->i_mapping, bh->b_page->index);
 	if (page) {
 		if (page_has_buffers(page)) {
 			n = bh_offset(bh) >> inode->i_blkbits;

@@ -622,10 +635,11 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
 		nilfs_palloc_clear_cache(inode);
 
 	nilfs_clear_dirty_pages(inode->i_mapping, true);
-	nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
+	nilfs_copy_back_pages(inode->i_mapping, shadow->inode->i_mapping);
 
-	nilfs_clear_dirty_pages(&ii->i_btnode_cache, true);
-	nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
+	nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping, true);
+	nilfs_copy_back_pages(ii->i_assoc_inode->i_mapping,
+			      NILFS_I(shadow->inode)->i_assoc_inode->i_mapping);
 
 	nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
 

@@ -640,10 +654,11 @@ void nilfs_mdt_clear_shadow_map(struct inode *inode)
 {
 	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
 	struct nilfs_shadow_map *shadow = mi->mi_shadow;
+	struct inode *shadow_btnc_inode = NILFS_I(shadow->inode)->i_assoc_inode;
 
 	down_write(&mi->mi_sem);
 	nilfs_release_frozen_buffers(shadow);
-	truncate_inode_pages(&shadow->frozen_data, 0);
-	truncate_inode_pages(&shadow->frozen_btnodes, 0);
+	truncate_inode_pages(shadow->inode->i_mapping, 0);
+	truncate_inode_pages(shadow_btnc_inode->i_mapping, 0);
 	up_write(&mi->mi_sem);
 }
fs/nilfs2/mdt.h

@@ -18,14 +18,12 @@
 /**
  * struct nilfs_shadow_map - shadow mapping of meta data file
  * @bmap_store: shadow copy of bmap state
- * @frozen_data: shadowed dirty data pages
- * @frozen_btnodes: shadowed dirty b-tree nodes' pages
+ * @inode: holder of page caches used in shadow mapping
  * @frozen_buffers: list of frozen buffers
  */
 struct nilfs_shadow_map {
 	struct nilfs_bmap_store bmap_store;
-	struct address_space frozen_data;
-	struct address_space frozen_btnodes;
+	struct inode *inode;
 	struct list_head frozen_buffers;
 };
 
fs/nilfs2/nilfs.h

@@ -28,7 +28,7 @@
  * @i_xattr: <TODO>
  * @i_dir_start_lookup: page index of last successful search
  * @i_cno: checkpoint number for GC inode
- * @i_btnode_cache: cached pages of b-tree nodes
+ * @i_assoc_inode: associated inode (B-tree node cache holder or back pointer)
  * @i_dirty: list for connecting dirty files
  * @xattr_sem: semaphore for extended attributes processing
  * @i_bh: buffer contains disk inode

@@ -43,7 +43,7 @@ struct nilfs_inode_info {
 	__u64 i_xattr;	/* sector_t ??? */
 	__u32 i_dir_start_lookup;
 	__u64 i_cno;	/* check point number for GC inode */
-	struct address_space i_btnode_cache;
+	struct inode *i_assoc_inode;
 	struct list_head i_dirty;	/* List for connecting dirty files */
 
 #ifdef CONFIG_NILFS_XATTR

@@ -75,13 +75,6 @@ NILFS_BMAP_I(const struct nilfs_bmap *bmap)
 	return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
 }
 
-static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
-{
-	struct nilfs_inode_info *ii =
-		container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
-	return &ii->vfs_inode;
-}
-
 /*
  * Dynamic state flags of NILFS on-memory inode (i_state)
  */

@@ -98,6 +91,8 @@ enum {
 	NILFS_I_INODE_SYNC,	/* dsync is not allowed for inode */
 	NILFS_I_BMAP,		/* has bmap and btnode_cache */
 	NILFS_I_GCINODE,	/* inode for GC, on memory only */
+	NILFS_I_BTNC,		/* inode for btree node cache */
+	NILFS_I_SHADOW,		/* inode for shadowed page cache */
 };
 
 /*

@@ -267,6 +262,9 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
 			 unsigned long ino);
 extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
 				       unsigned long ino, __u64 cno);
+int nilfs_attach_btree_node_cache(struct inode *inode);
+void nilfs_detach_btree_node_cache(struct inode *inode);
+struct inode *nilfs_iget_for_shadow(struct inode *inode);
 extern void nilfs_update_inode(struct inode *, struct buffer_head *, int);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
fs/nilfs2/page.c

@@ -436,22 +436,12 @@ unsigned int nilfs_page_count_clean_buffers(struct page *page,
 	return nc;
 }
 
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
-{
-	mapping->host = inode;
-	mapping->flags = 0;
-	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	mapping->private_data = NULL;
-	mapping->a_ops = &empty_aops;
-}
-
 /*
  * NILFS2 needs clear_page_dirty() in the following two cases:
  *
- * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
- *    page dirty flags when it copies back pages from the shadow cache
- *    (gcdat->{i_mapping,i_btnode_cache}) to its original cache
- *    (dat->{i_mapping,i_btnode_cache}).
+ * 1) For B-tree node pages and data pages of DAT file, NILFS2 clears dirty
+ *    flag of pages when it copies back pages from shadow cache to the
+ *    original cache.
 *
 * 2) Some B-tree operations like insertion or deletion may dispose buffers
 *    in dirty state, and this needs to cancel the dirty state of their pages.
fs/nilfs2/page.h

@@ -43,7 +43,6 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_page(struct page *, bool);
 void nilfs_clear_dirty_pages(struct address_space *, bool);
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
 unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int,
 					    unsigned int);
 unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
fs/nilfs2/segment.c

@@ -733,15 +733,18 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
 					    struct list_head *listp)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
-	struct address_space *mapping = &ii->i_btnode_cache;
+	struct inode *btnc_inode = ii->i_assoc_inode;
 	struct pagevec pvec;
 	struct buffer_head *bh, *head;
 	unsigned int i;
 	pgoff_t index = 0;
 
+	if (!btnc_inode)
+		return;
+
 	pagevec_init(&pvec);
 
-	while (pagevec_lookup_tag(&pvec, mapping, &index,
+	while (pagevec_lookup_tag(&pvec, btnc_inode->i_mapping, &index,
 				  PAGECACHE_TAG_DIRTY)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			bh = head = page_buffers(pvec.pages[i]);

@@ -2410,7 +2413,7 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
 			continue;
 		list_del_init(&ii->i_dirty);
 		truncate_inode_pages(&ii->vfs_inode.i_data, 0);
-		nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+		nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping);
 		iput(&ii->vfs_inode);
 	}
 }
fs/nilfs2/super.c

@@ -157,7 +157,8 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
 	ii->i_bh = NULL;
 	ii->i_state = 0;
 	ii->i_cno = 0;
-	nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
+	ii->i_assoc_inode = NULL;
+	ii->i_bmap = &ii->i_bmap_data;
 	return &ii->vfs_inode;
 }
 

@@ -1377,8 +1378,6 @@ static void nilfs_inode_init_once(void *obj)
 #ifdef CONFIG_NILFS_XATTR
 	init_rwsem(&ii->xattr_sem);
 #endif
-	address_space_init_once(&ii->i_btnode_cache);
-	ii->i_bmap = &ii->i_bmap_data;
 	inode_init_once(&ii->vfs_inode);
 }
 
fs/ocfs2/quota_global.c

@@ -337,7 +337,6 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
 /* Read information header from global quota file */
 int ocfs2_global_read_info(struct super_block *sb, int type)
 {
-	struct inode *gqinode = NULL;
 	unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
 					      GROUP_QUOTA_SYSTEM_INODE };
 	struct ocfs2_global_disk_dqinfo dinfo;

@@ -346,29 +345,31 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	u64 pcount;
 	int status;
 
+	oinfo->dqi_gi.dqi_sb = sb;
+	oinfo->dqi_gi.dqi_type = type;
+	ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+	oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+	oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+	oinfo->dqi_gqi_bh = NULL;
+	oinfo->dqi_gqi_count = 0;
+
 	/* Read global header */
-	gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+	oinfo->dqi_gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
 			OCFS2_INVALID_SLOT);
-	if (!gqinode) {
+	if (!oinfo->dqi_gqinode) {
 		mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
 			type);
 		status = -EINVAL;
 		goto out_err;
 	}
-	oinfo->dqi_gi.dqi_sb = sb;
-	oinfo->dqi_gi.dqi_type = type;
-	oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
-	oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
-	oinfo->dqi_gqi_bh = NULL;
-	oinfo->dqi_gqi_count = 0;
-	oinfo->dqi_gqinode = gqinode;
+
 	status = ocfs2_lock_global_qf(oinfo, 0);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_err;
 	}
 
-	status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
+	status = ocfs2_extent_map_get_blocks(oinfo->dqi_gqinode, 0, &oinfo->dqi_giblk,
 					     &pcount, NULL);
 	if (status < 0)
 		goto out_unlock;
fs/ocfs2/quota_local.c

@@ -702,8 +702,6 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
 	info->dqi_priv = oinfo;
 	oinfo->dqi_type = type;
 	INIT_LIST_HEAD(&oinfo->dqi_chunk);
-	oinfo->dqi_gqinode = NULL;
-	ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
 	oinfo->dqi_rec = NULL;
 	oinfo->dqi_lqi_bh = NULL;
 	oinfo->dqi_libh = NULL;
include/linux/gfp.h

@@ -264,9 +264,7 @@ struct vm_area_struct;
 #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
 
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (24 + \
-			  3 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \
-			  IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /**
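The arithmetic behind this one-line fix can be checked in userspace. The sketch below assumes, as the hunk implies, that the three KASAN tag bits sit at positions 24-26 and ___GFP_NOLOCKDEP at 27; it prints the mask produced by the old and new formulas for a CONFIG_LOCKDEP=y, CONFIG_KASAN_HW_TAGS=n build, where the old mask was too narrow to contain the lockdep bit.

#include <stdio.h>

/* Bit layout assumed from the hunk above: three KASAN bits, then NOLOCKDEP. */
#define GFP_NOLOCKDEP_BIT 27u

static unsigned int old_shift(int kasan_hw_tags, int lockdep)
{
        return 24 + 3 * kasan_hw_tags + lockdep;        /* pre-fix formula */
}

static unsigned int new_shift(int lockdep)
{
        return 27 + lockdep;                            /* fixed formula */
}

static unsigned int mask(unsigned int shift)
{
        return (1u << shift) - 1;
}

int main(void)
{
        /* CONFIG_KASAN_HW_TAGS=n, CONFIG_LOCKDEP=y: the broken combination. */
        unsigned int oldm = mask(old_shift(0, 1));
        unsigned int newm = mask(new_shift(1));
        unsigned int nolockdep = 1u << GFP_NOLOCKDEP_BIT;

        printf("old mask %#x covers __GFP_NOLOCKDEP: %s\n",
               oldm, (oldm & nolockdep) ? "yes" : "no");
        printf("new mask %#x covers __GFP_NOLOCKDEP: %s\n",
               newm, (newm & nolockdep) ? "yes" : "no");
        return 0;
}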
mm/damon/core.c

@@ -1019,12 +1019,15 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
 	struct damos *s;
 	unsigned long wait_time;
 	unsigned long min_wait_time = 0;
+	bool init_wait_time = false;
 
 	while (!kdamond_need_stop(ctx)) {
 		damon_for_each_scheme(s, ctx) {
 			wait_time = damos_wmark_wait_us(s);
-			if (!min_wait_time || wait_time < min_wait_time)
+			if (!init_wait_time || wait_time < min_wait_time) {
+				init_wait_time = true;
 				min_wait_time = wait_time;
+			}
 		}
 		if (!min_wait_time)
 			return 0;
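The DAMON hunk replaces the "min_wait_time == 0" sentinel with an explicit init flag: an activated scheme legitimately reports a wait time of 0, but the old test treated that 0 as "not initialised yet" and let a deactivated scheme's long wait overwrite it, so the activated scheme ended up sleeping. A minimal userspace rendering of the two selection loops (the wait values are made up):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-scheme wait times in microseconds: the first scheme is
 * activated (no wait needed), the second is deactivated (long wait). */
static const unsigned long wait_us[] = { 0, 100000 };
#define NR_SCHEMES (sizeof(wait_us) / sizeof(wait_us[0]))

static unsigned long min_wait_old(void)
{
        unsigned long min = 0;
        size_t i;

        for (i = 0; i < NR_SCHEMES; i++)
                if (!min || wait_us[i] < min)   /* 0 doubles as "uninitialised" */
                        min = wait_us[i];
        return min;
}

static unsigned long min_wait_new(void)
{
        unsigned long min = 0;
        bool init = false;
        size_t i;

        for (i = 0; i < NR_SCHEMES; i++)
                if (!init || wait_us[i] < min) {
                        init = true;
                        min = wait_us[i];
                }
        return min;
}

int main(void)
{
        printf("old selection sleeps for %lu us\n", min_wait_old());
        printf("new selection sleeps for %lu us\n", min_wait_new());
        return 0;
}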
mm/gup.c

@@ -1404,6 +1404,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long nr_pages = (end - start) / PAGE_SIZE;
 	int gup_flags;
+	long ret;
 
 	VM_BUG_ON(!PAGE_ALIGNED(start));
 	VM_BUG_ON(!PAGE_ALIGNED(end));

@@ -1438,8 +1439,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
 	 * We made sure addr is within a VMA, so the following will
 	 * not result in a stack expansion that recurses back here.
 	 */
-	return __get_user_pages(mm, start, nr_pages, gup_flags,
+	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
 				NULL, NULL, locked);
+	lru_add_drain();
+	return ret;
 }
 
 /*

@@ -1471,6 +1474,7 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long nr_pages = (end - start) / PAGE_SIZE;
 	int gup_flags;
+	long ret;
 
 	VM_BUG_ON(!PAGE_ALIGNED(start));
 	VM_BUG_ON(!PAGE_ALIGNED(end));

@@ -1498,8 +1502,10 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
 	if (check_vma_flags(vma, gup_flags))
 		return -EINVAL;
 
-	return __get_user_pages(mm, start, nr_pages, gup_flags,
+	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
 				NULL, NULL, locked);
+	lru_add_drain();
+	return ret;
 }
 
 /*
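These two mm/gup.c hunks make the populate and fault-in paths flush the per-CPU mlock pagevec (via lru_add_drain()) before returning, so pages brought in by mlock()/MLOCK_ONFAULT are accounted promptly instead of sitting in a CPU-local batch; that lag is what made memcg_stat_test flaky. As a userspace analogy only, here is a batched counter that an observer can trust only after an explicit drain at the end of the bulk operation; every name below is invented for illustration.

#include <stdio.h>

#define BATCH 16

static long visible_count;      /* what an observer (the test) reads */
static long batch[BATCH];       /* per-"CPU" pending pages */
static int batched;

static void drain(void)
{
        int i;

        for (i = 0; i < batched; i++)
                visible_count += batch[i];
        batched = 0;
}

static void note_page(long nr)
{
        batch[batched++] = nr;
        if (batched == BATCH)   /* full batch flushes as a side effect */
                drain();
}

/* Bulk populate: without the final drain() the observer may still see 0. */
static void populate_range(int nr_pages)
{
        int i;

        for (i = 0; i < nr_pages; i++)
                note_page(1);
        drain();                /* analogue of the added lru_add_drain() */
}

int main(void)
{
        populate_range(10);
        printf("pages visible after populate: %ld\n", visible_count);
        return 0;
}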
mm/internal.h

@@ -456,7 +456,8 @@ static inline void munlock_vma_page(struct page *page,
 }
 void mlock_new_page(struct page *page);
 bool need_mlock_page_drain(int cpu);
-void mlock_page_drain(int cpu);
+void mlock_page_drain_local(void);
+void mlock_page_drain_remote(int cpu);
 
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 

@@ -539,7 +540,8 @@ static inline void munlock_vma_page(struct page *page,
 			struct vm_area_struct *vma, bool compound) { }
 static inline void mlock_new_page(struct page *page) { }
 static inline bool need_mlock_page_drain(int cpu) { return false; }
-static inline void mlock_page_drain(int cpu) { }
+static inline void mlock_page_drain_local(void) { }
+static inline void mlock_page_drain_remote(int cpu) { }
 static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
 {
 }
mm/kfence/core.c

@@ -566,6 +566,8 @@ static unsigned long kfence_init_pool(void)
 	 * enters __slab_free() slow-path.
 	 */
 	for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
+		struct slab *slab = page_slab(&pages[i]);
+
 		if (!i || (i % 2))
 			continue;
 

@@ -573,7 +575,11 @@ static unsigned long kfence_init_pool(void)
 		if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
 			return addr;
 
-		__SetPageSlab(&pages[i]);
+		__folio_set_slab(slab_folio(slab));
+#ifdef CONFIG_MEMCG
+		slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
+				   MEMCG_DATA_OBJCGS;
+#endif
 	}
 
 	/*

@@ -1033,6 +1039,9 @@ void __kfence_free(void *addr)
 {
 	struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
 
+#ifdef CONFIG_MEMCG
+	KFENCE_WARN_ON(meta->objcg);
+#endif
 	/*
	 * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
	 * the object, as the object page may be recycled for other-typed
mm/kfence/kfence.h

@@ -89,6 +89,9 @@ struct kfence_metadata {
 	struct kfence_track free_track;
 	/* For updating alloc_covered on frees. */
 	u32 alloc_stack_hash;
+#ifdef CONFIG_MEMCG
+	struct obj_cgroup *objcg;
+#endif
 };
 
 extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
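With these two kfence hunks, each metadata slot carries its own obj_cgroup pointer and the pool page's memcg_data is pointed at it with the objcg-vector tag, so accounting code treats the KFENCE slab like any other slab with an objcgs vector. The tagging trick itself is just "store an aligned pointer with a low bit set"; the self-contained illustration below uses a made-up tag value and helpers, not the kernel's MEMCG_DATA_* definitions.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative tag in the low bits of an aligned pointer; purely for the demo. */
#define DEMO_TAG_OBJCGS 0x1UL

struct objcg { int id; };

static uintptr_t pack(struct objcg **vec)
{
        /* Pointers to aligned data have free low bits to carry a tag. */
        assert(((uintptr_t)vec & DEMO_TAG_OBJCGS) == 0);
        return (uintptr_t)vec | DEMO_TAG_OBJCGS;
}

static struct objcg **unpack(uintptr_t data, int *is_objcg_vec)
{
        *is_objcg_vec = data & DEMO_TAG_OBJCGS;
        return (struct objcg **)(data & ~DEMO_TAG_OBJCGS);
}

int main(void)
{
        static struct objcg cg = { .id = 42 };
        static struct objcg *vec[1] = { &cg };
        int tagged;

        uintptr_t memcg_data = pack(vec);
        struct objcg **v = unpack(memcg_data, &tagged);

        printf("tagged as objcg vector: %d, first id: %d\n", tagged, v[0]->id);
        return 0;
}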
mm/kmemleak.c

@@ -796,6 +796,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 	unsigned long flags;
 	struct kmemleak_object *object;
 	struct kmemleak_scan_area *area = NULL;
+	unsigned long untagged_ptr;
+	unsigned long untagged_objp;
 
 	object = find_and_get_object(ptr, 1);
 	if (!object) {

@@ -804,6 +806,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 		return;
 	}
 
+	untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+	untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
+
 	if (scan_area_cache)
 		area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
 

@@ -815,8 +820,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 		goto out_unlock;
 	}
 	if (size == SIZE_MAX) {
-		size = object->pointer + object->size - ptr;
-	} else if (ptr + size > object->pointer + object->size) {
+		size = untagged_objp + object->size - untagged_ptr;
+	} else if (untagged_ptr + size > untagged_objp + object->size) {
 		kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
 		dump_object_info(object);
 		kmem_cache_free(scan_area_cache, area);
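The kmemleak fix resets the tag on both pointers before comparing a requested scan area against the object's bounds; otherwise two pointers into the same object, carrying different tags in their top byte, compare as if the area overflowed the object. The top-byte tagging and the reset can be modelled in plain C; the 56-bit address / 8-bit tag split below is an assumption for the demo (arm64 TBI style), and nothing here is kernel code.

#include <stdint.h>
#include <stdio.h>

/* Demo layout: tag in bits 56-63, address in bits 0-55. */
static uint64_t set_tag(uint64_t addr, uint8_t tag)
{
        return (addr & 0x00ffffffffffffffULL) | ((uint64_t)tag << 56);
}

static uint64_t reset_tag(uint64_t addr)
{
        return addr & 0x00ffffffffffffffULL;
}

int main(void)
{
        uint64_t obj_start = set_tag(0x1000, 0xAA);  /* object pointer, tag 0xAA */
        uint64_t area_ptr  = set_tag(0x1010, 0xBB);  /* scan area,      tag 0xBB */
        uint64_t obj_size = 0x100, area_size = 0x20;

        /* Tagged comparison: the differing top bytes dominate, the area looks
         * far outside the object, and it would be (wrongly) rejected. */
        int bad_reject = area_ptr + area_size > obj_start + obj_size;

        /* Untagged comparison, as add_scan_area() now does. */
        int good_reject = reset_tag(area_ptr) + area_size >
                          reset_tag(obj_start) + obj_size;

        printf("rejected with tags kept:  %d\n", bad_reject);
        printf("rejected with tags reset: %d\n", good_reject);
        return 0;
}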
mm/madvise.c

@@ -1464,16 +1464,9 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 
 	while (iov_iter_count(&iter)) {
 		iovec = iov_iter_iovec(&iter);
-		/*
-		 * do_madvise returns ENOMEM if unmapped holes are present
-		 * in the passed VMA. process_madvise() is expected to skip
-		 * unmapped holes passed to it in the 'struct iovec' list
-		 * and not fail because of them. Thus treat -ENOMEM return
-		 * from do_madvise as valid and continue processing.
-		 */
 		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
 					iovec.iov_len, behavior);
-		if (ret < 0 && ret != -ENOMEM)
+		if (ret < 0)
 			break;
 		iov_iter_advance(&iter, iovec.iov_len);
 	}
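The revert restores strict error handling in the process_madvise() loop: an -ENOMEM from do_madvise() once again terminates the walk instead of being skipped, because silently continuing also swallowed genuine allocation failures and made the return value misleading. The control-flow difference is the whole change; a compact, made-up illustration:

#include <stdio.h>

/* Pretend results of do_madvise() for three iovec entries; the middle one
 * fails with -ENOMEM (-12). */
static const int results[] = { 0, -12, 0 };
#define N (sizeof(results) / sizeof(results[0]))

static int walk(int skip_enomem)
{
        size_t done = 0;
        size_t i;
        int ret = 0;

        for (i = 0; i < N; i++) {
                ret = results[i];
                if (ret < 0 && !(skip_enomem && ret == -12))
                        break;          /* reverted behaviour: stop here */
                done++;
        }
        return done == N ? 0 : ret;
}

int main(void)
{
        printf("with -ENOMEM skipped, walk() returns %d\n", walk(1));
        printf("with the revert,      walk() returns %d\n", walk(0));
        return 0;
}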
mm/memory.c

@@ -3918,14 +3918,18 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
 		return ret;
 
 	if (unlikely(PageHWPoison(vmf->page))) {
+		struct page *page = vmf->page;
 		vm_fault_t poisonret = VM_FAULT_HWPOISON;
 		if (ret & VM_FAULT_LOCKED) {
+			if (page_mapped(page))
+				unmap_mapping_pages(page_mapping(page),
+						    page->index, 1, false);
 			/* Retry if a clean page was removed from the cache. */
-			if (invalidate_inode_page(vmf->page))
-				poisonret = 0;
-			unlock_page(vmf->page);
+			if (invalidate_inode_page(page))
+				poisonret = VM_FAULT_NOPAGE;
+			unlock_page(page);
 		}
-		put_page(vmf->page);
+		put_page(page);
 		vmf->page = NULL;
 		return poisonret;
 	}
mm/migrate.c

@@ -246,7 +246,7 @@ static bool remove_migration_pte(struct folio *folio,
 			set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
 		}
 		if (vma->vm_flags & VM_LOCKED)
-			mlock_page_drain(smp_processor_id());
+			mlock_page_drain_local();
 
 		trace_remove_migration_pte(pvmw.address, pte_val(pte),
 					   compound_order(new));
mm/mlock.c

@@ -28,7 +28,14 @@
 
 #include "internal.h"
 
-static DEFINE_PER_CPU(struct pagevec, mlock_pvec);
+struct mlock_pvec {
+	local_lock_t lock;
+	struct pagevec vec;
+};
+
+static DEFINE_PER_CPU(struct mlock_pvec, mlock_pvec) = {
+	.lock = INIT_LOCAL_LOCK(lock),
+};
 
 bool can_do_mlock(void)
 {

@@ -203,18 +210,30 @@ static void mlock_pagevec(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
-void mlock_page_drain(int cpu)
+void mlock_page_drain_local(void)
 {
 	struct pagevec *pvec;
 
-	pvec = &per_cpu(mlock_pvec, cpu);
+	local_lock(&mlock_pvec.lock);
+	pvec = this_cpu_ptr(&mlock_pvec.vec);
 	if (pagevec_count(pvec))
 		mlock_pagevec(pvec);
+	local_unlock(&mlock_pvec.lock);
+}
 
+void mlock_page_drain_remote(int cpu)
+{
+	struct pagevec *pvec;
+
+	WARN_ON_ONCE(cpu_online(cpu));
+	pvec = &per_cpu(mlock_pvec.vec, cpu);
+	if (pagevec_count(pvec))
+		mlock_pagevec(pvec);
+}
+
 bool need_mlock_page_drain(int cpu)
 {
-	return pagevec_count(&per_cpu(mlock_pvec, cpu));
+	return pagevec_count(&per_cpu(mlock_pvec.vec, cpu));
 }
 
 /**

@@ -223,7 +242,10 @@ bool need_mlock_page_drain(int cpu)
 */
 void mlock_folio(struct folio *folio)
 {
-	struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+	struct pagevec *pvec;
+
+	local_lock(&mlock_pvec.lock);
+	pvec = this_cpu_ptr(&mlock_pvec.vec);
 
 	if (!folio_test_set_mlocked(folio)) {
 		int nr_pages = folio_nr_pages(folio);

@@ -236,7 +258,7 @@ void mlock_folio(struct folio *folio)
 	if (!pagevec_add(pvec, mlock_lru(&folio->page)) ||
 	    folio_test_large(folio) || lru_cache_disabled())
 		mlock_pagevec(pvec);
-	put_cpu_var(mlock_pvec);
+	local_unlock(&mlock_pvec.lock);
 }
 
 /**

@@ -245,9 +267,11 @@ void mlock_folio(struct folio *folio)
 */
 void mlock_new_page(struct page *page)
 {
-	struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+	struct pagevec *pvec;
 	int nr_pages = thp_nr_pages(page);
 
+	local_lock(&mlock_pvec.lock);
+	pvec = this_cpu_ptr(&mlock_pvec.vec);
 	SetPageMlocked(page);
 	mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
 	__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);

@@ -256,7 +280,7 @@ void mlock_new_page(struct page *page)
 	if (!pagevec_add(pvec, mlock_new(page)) ||
 	    PageHead(page) || lru_cache_disabled())
 		mlock_pagevec(pvec);
-	put_cpu_var(mlock_pvec);
+	local_unlock(&mlock_pvec.lock);
 }
 
 /**

@@ -265,8 +289,10 @@ void mlock_new_page(struct page *page)
 */
 void munlock_page(struct page *page)
 {
-	struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+	struct pagevec *pvec;
 
+	local_lock(&mlock_pvec.lock);
+	pvec = this_cpu_ptr(&mlock_pvec.vec);
 	/*
	 * TestClearPageMlocked(page) must be left to __munlock_page(),
	 * which will check whether the page is multiply mlocked.

@@ -276,7 +302,7 @@ void munlock_page(struct page *page)
 	if (!pagevec_add(pvec, page) ||
 	    PageHead(page) || lru_cache_disabled())
 		mlock_pagevec(pvec);
-	put_cpu_var(mlock_pvec);
+	local_unlock(&mlock_pvec.lock);
 }
 
 static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
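mm/mlock.c now wraps the per-CPU pagevec in a struct together with a local_lock_t, takes that lock around every user, and splits draining into a local flavour (runs on the owning CPU under the lock) and a remote flavour (only legal for a CPU that is already offline, hence the WARN_ON_ONCE). The shape of that API can be mimicked in userspace with one mutex per "CPU"; this is an analogy only, since local_lock_t on a non-PREEMPT_RT kernel is far cheaper than a mutex, and the names below are invented.

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 2
#define BATCH   4

/* Userspace stand-in for "struct mlock_pvec" + DEFINE_PER_CPU. */
struct mlock_batch {
        pthread_mutex_t lock;   /* plays the role of local_lock_t */
        int pages[BATCH];
        int nr;
};

static struct mlock_batch per_cpu[NR_CPUS] = {
        { PTHREAD_MUTEX_INITIALIZER }, { PTHREAD_MUTEX_INITIALIZER },
};

static int mlocked_total;

static void flush(struct mlock_batch *b)
{
        mlocked_total += b->nr;
        b->nr = 0;
}

/* mlock_folio()-style fast path: batch under the CPU-local lock. */
static void mlock_page(int cpu, int page)
{
        struct mlock_batch *b = &per_cpu[cpu];

        pthread_mutex_lock(&b->lock);
        b->pages[b->nr++] = page;
        if (b->nr == BATCH)
                flush(b);
        pthread_mutex_unlock(&b->lock);
}

/* mlock_page_drain_local(): drain this CPU's batch under its lock. */
static void drain_local(int cpu)
{
        struct mlock_batch *b = &per_cpu[cpu];

        pthread_mutex_lock(&b->lock);
        flush(b);
        pthread_mutex_unlock(&b->lock);
}

/* mlock_page_drain_remote(): lock-free, only valid once the CPU is gone. */
static void drain_remote(int cpu, int cpu_online)
{
        if (cpu_online) {
                fprintf(stderr, "refusing remote drain of an online CPU\n");
                return;
        }
        flush(&per_cpu[cpu]);
}

int main(void)
{
        mlock_page(0, 1);
        mlock_page(0, 2);
        drain_local(0);         /* what lru_add_drain() now triggers */
        mlock_page(1, 3);
        drain_remote(1, 0);     /* CPU 1 pretend-offline, as in CPU hotplug */
        printf("mlocked pages accounted: %d\n", mlocked_total);
        return 0;
}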
mm/page_alloc.c

@@ -8367,6 +8367,7 @@ static int page_alloc_cpu_dead(unsigned int cpu)
 	struct zone *zone;
 
 	lru_add_drain_cpu(cpu);
+	mlock_page_drain_remote(cpu);
 	drain_pages(cpu);
 
 	/*
mm/rmap.c

@@ -1683,7 +1683,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 		 */
 		page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
 		if (vma->vm_flags & VM_LOCKED)
-			mlock_page_drain(smp_processor_id());
+			mlock_page_drain_local();
 		folio_put(folio);
 	}
 

@@ -1961,7 +1961,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 		 */
 		page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
 		if (vma->vm_flags & VM_LOCKED)
-			mlock_page_drain(smp_processor_id());
+			mlock_page_drain_local();
 		folio_put(folio);
 	}
 
mm/swap.c

@@ -624,7 +624,6 @@ void lru_add_drain_cpu(int cpu)
 		pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
 
 	activate_page_drain(cpu);
-	mlock_page_drain(cpu);
 }
 
 /**

@@ -706,6 +705,7 @@ void lru_add_drain(void)
 	local_lock(&lru_pvecs.lock);
 	lru_add_drain_cpu(smp_processor_id());
 	local_unlock(&lru_pvecs.lock);
+	mlock_page_drain_local();
 }
 
 /*

@@ -720,6 +720,7 @@ static void lru_add_and_bh_lrus_drain(void)
 	lru_add_drain_cpu(smp_processor_id());
 	local_unlock(&lru_pvecs.lock);
 	invalidate_bh_lrus_cpu();
+	mlock_page_drain_local();
 }
 
 void lru_add_drain_cpu_zone(struct zone *zone)

@@ -728,6 +729,7 @@ void lru_add_drain_cpu_zone(struct zone *zone)
 	lru_add_drain_cpu(smp_processor_id());
 	drain_local_pages(zone);
 	local_unlock(&lru_pvecs.lock);
+	mlock_page_drain_local();
 }
 
 #ifdef CONFIG_SMP
tools/vm/page_owner_sort.c

@@ -441,7 +441,6 @@ static void usage(void)
 		"-n\t\tSort by task command name.\n"
 		"-a\t\tSort by memory allocate time.\n"
 		"-r\t\tSort by memory release time.\n"
-		"-c\t\tCull by comparing stacktrace instead of total block.\n"
 		"-f\t\tFilter out the information of blocks whose memory has been released.\n"
 		"--pid <PID>\tSelect by pid. This selects the information of blocks whose process ID number equals to <PID>.\n"
 		"--tgid <TGID>\tSelect by tgid. This selects the information of blocks whose Thread Group ID number equals to <TGID>.\n"

@@ -466,14 +465,11 @@ int main(int argc, char **argv)
 		{ 0, 0, 0, 0},
 	};
 
-	while ((opt = getopt_long(argc, argv, "acfmnprstP", longopts, NULL)) != -1)
+	while ((opt = getopt_long(argc, argv, "afmnprstP", longopts, NULL)) != -1)
 		switch (opt) {
 		case 'a':
 			cmp = compare_ts;
 			break;
-		case 'c':
-			cull = cull | CULL_STACKTRACE;
-			break;
 		case 'f':
 			filter = filter | FILTER_UNRELEASE;
 			break;