mm/filemap.c: generic_file_buffered_read() now uses find_get_pages_contig

Convert generic_file_buffered_read() to get pages to read from in batches,
and then copy data to userspace from many pages at once - in particular,
we now don't touch any cachelines that might be contended while we're in
the loop to copy data to userspace.

This is is a performance improvement on workloads that do buffered reads
with large blocksizes, and a very large performance improvement if that
file is also being accessed concurrently by different threads.

On smaller reads (512 bytes), there's a very small performance improvement
(1%, within the margin of error).

akpm: kernel test robot found a 32% speedup on one test:
https://lkml.kernel.org/r/20201030081456.GY31092@shao2-debian

Link: https://lkml.kernel.org/r/20201025212949.602194-3-kent.overstreet@gmail.com
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: kernel test robot <rong.a.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Kent Overstreet 2020-12-14 19:04:56 -08:00 committed by Linus Torvalds
parent 723ef24b9b
commit 06c0444290
1 changed files with 177 additions and 140 deletions

View File

@ -2176,67 +2176,6 @@ static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
return lock_page_killable(page);
}
static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
struct iov_iter *iter,
struct page *page)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &iocb->ki_filp->f_ra;
unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
unsigned int bytes, copied;
loff_t isize, end_offset;
BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);
/*
* i_size must be checked after we know the page is Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "bytes", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
isize = i_size_read(inode);
if (unlikely(iocb->ki_pos >= isize))
return 1;
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*/
copied = copy_page_to_iter(page, offset, bytes, iter);
iocb->ki_pos += copied;
/*
* When a sequential read accesses a page several times,
* only mark it as accessed the first time.
*/
if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
mark_page_accessed(page);
ra->prev_pos = iocb->ki_pos;
if (copied < bytes)
return -EFAULT;
return !iov_iter_count(iter) || iocb->ki_pos == isize;
}
static struct page *
generic_file_buffered_read_readpage(struct kiocb *iocb,
struct file *filp,
@ -2394,6 +2333,92 @@ generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
}
static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
struct iov_iter *iter,
struct page **pages,
unsigned int nr)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
int i, j, nr_got, err = 0;
nr = min_t(unsigned long, last_index - index, nr);
find_page:
if (fatal_signal_pending(current))
return -EINTR;
nr_got = find_get_pages_contig(mapping, index, nr, pages);
if (nr_got)
goto got_pages;
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
nr_got = find_get_pages_contig(mapping, index, nr, pages);
if (nr_got)
goto got_pages;
pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
err = PTR_ERR_OR_ZERO(pages[0]);
if (!IS_ERR_OR_NULL(pages[0]))
nr_got = 1;
got_pages:
for (i = 0; i < nr_got; i++) {
struct page *page = pages[i];
pgoff_t pg_index = index + i;
loff_t pg_pos = max(iocb->ki_pos,
(loff_t) pg_index << PAGE_SHIFT);
loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;
if (PageReadahead(page)) {
if (iocb->ki_flags & IOCB_NOIO) {
for (j = i; j < nr_got; j++)
put_page(pages[j]);
nr_got = i;
err = -EAGAIN;
break;
}
page_cache_async_readahead(mapping, ra, filp, page,
pg_index, last_index - pg_index);
}
if (!PageUptodate(page)) {
if ((iocb->ki_flags & IOCB_NOWAIT) ||
((iocb->ki_flags & IOCB_WAITQ) && i)) {
for (j = i; j < nr_got; j++)
put_page(pages[j]);
nr_got = i;
err = -EAGAIN;
break;
}
page = generic_file_buffered_read_pagenotuptodate(iocb,
filp, iter, page, pg_pos, pg_count);
if (IS_ERR_OR_NULL(page)) {
for (j = i + 1; j < nr_got; j++)
put_page(pages[j]);
nr_got = i;
err = PTR_ERR_OR_ZERO(page);
break;
}
}
}
if (likely(nr_got))
return nr_got;
if (err)
return err;
/*
* No pages and no error means we raced and should retry:
*/
goto find_page;
}
/**
* generic_file_buffered_read - generic file read routine
* @iocb: the iocb to read
@ -2414,104 +2439,116 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
{
struct file *filp = iocb->ki_filp;
struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &filp->f_ra;
size_t orig_count = iov_iter_count(iter);
pgoff_t last_index;
int error = 0;
struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
unsigned int nr_pages = min_t(unsigned int, 512,
((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
(iocb->ki_pos >> PAGE_SHIFT));
int i, pg_nr, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
if (nr_pages > ARRAY_SIZE(pages_onstack))
pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
/*
* If we've already successfully copied some data, then we
* can no longer safely return -EIOCBQUEUED. Hence mark
* an async read NOWAIT at that point.
*/
if (written && (iocb->ki_flags & IOCB_WAITQ))
iocb->ki_flags |= IOCB_NOWAIT;
for (;;) {
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
struct page *page;
if (!pages) {
pages = pages_onstack;
nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
}
do {
cond_resched();
find_page:
if (fatal_signal_pending(current)) {
error = -EINTR;
goto out;
/*
* If we've already successfully copied some data, then we
* can no longer safely return -EIOCBQUEUED. Hence mark
* an async read NOWAIT at that point.
*/
if ((iocb->ki_flags & IOCB_WAITQ) && written)
iocb->ki_flags |= IOCB_NOWAIT;
i = 0;
pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
pages, nr_pages);
if (pg_nr < 0) {
error = pg_nr;
break;
}
/*
* We can't return -EIOCBQUEUED once we've done some work, so
* ensure we don't block:
* i_size must be checked after we know the pages are Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
if ((iocb->ki_flags & IOCB_WAITQ) &&
(written + orig_count - iov_iter_count(iter)))
iocb->ki_flags |= IOCB_NOWAIT;
isize = i_size_read(inode);
if (unlikely(iocb->ki_pos >= isize))
goto put_pages;
page = find_get_page(mapping, index);
if (!page) {
if (iocb->ki_flags & IOCB_NOIO)
goto would_block;
page_cache_sync_readahead(mapping,
ra, filp,
index, last_index - index);
page = find_get_page(mapping, index);
if (unlikely(page == NULL)) {
page = generic_file_buffered_read_no_cached_page(iocb, iter);
if (!page)
goto find_page;
if (IS_ERR(page)) {
error = PTR_ERR(page);
goto out;
}
}
}
if (PageReadahead(page)) {
if (iocb->ki_flags & IOCB_NOIO) {
put_page(page);
goto out;
}
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
if (!PageUptodate(page)) {
if (iocb->ki_flags & IOCB_NOWAIT) {
put_page(page);
error = -EAGAIN;
goto out;
}
page = generic_file_buffered_read_pagenotuptodate(iocb,
filp, iter, page, iocb->ki_pos, iter->count);
if (!page)
goto find_page;
if (IS_ERR(page)) {
error = PTR_ERR(page);
goto out;
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
(end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
put_page(pages[--pg_nr]);
/*
* Once we start copying data, we don't want to be touching any
* cachelines that might be contended:
*/
writably_mapped = mapping_writably_mapped(mapping);
/*
* When a sequential read accesses a page several times, only
* mark it as accessed the first time.
*/
if (iocb->ki_pos >> PAGE_SHIFT !=
ra->prev_pos >> PAGE_SHIFT)
mark_page_accessed(pages[0]);
for (i = 1; i < pg_nr; i++)
mark_page_accessed(pages[i]);
for (i = 0; i < pg_nr; i++) {
unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
PAGE_SIZE - offset);
unsigned int copied;
/*
* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (writably_mapped)
flush_dcache_page(pages[i]);
copied = copy_page_to_iter(pages[i], offset, bytes, iter);
written += copied;
iocb->ki_pos += copied;
ra->prev_pos = iocb->ki_pos;
if (copied < bytes) {
error = -EFAULT;
break;
}
}
put_pages:
for (i = 0; i < pg_nr; i++)
put_page(pages[i]);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
error = generic_file_buffered_read_page_ok(iocb, iter, page);
put_page(page);
if (error) {
if (error > 0)
error = 0;
goto out;
}
}
would_block:
error = -EAGAIN;
out:
file_accessed(filp);
written += orig_count - iov_iter_count(iter);
if (pages != pages_onstack)
kfree(pages);
return written ? written : error;
}