erofs: fix infinite loop due to a race of filling compressed_bvecs

I encountered a race issue after lengthy (~594647 secs) stress tests on
a 64k-page arm64 VM with several 4k-block EROFS images.  The timing
is like below:

z_erofs_try_inplace_io                  z_erofs_fill_bio_vec
  cmpxchg(&compressed_bvecs[].page,
          NULL, ..)
                                        [access bufvec]
  compressed_bvecs[] = *bvec;

Previously, z_erofs_submit_queue() just accessed bufvec->page only, so
other fields in bufvec didn't matter.  After the subpage block support
is landed, .offset and .end can be used too, but filling bufvec isn't
an atomic operation which can cause inconsistency.

Let's use a spinlock to keep the atomicity of each bufvec.  More
specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`
since it's rarely used (also it takes a short time if even used) as long
as the pcluster has a reference.

Fixes: 192351616a ("erofs: support I/O submission for sub-page compressed blocks")
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Reviewed-by: Yue Hu <huyue2@coolpad.com>
Reviewed-by: Sandeep Dhavale <dhavale@google.com>
Link: https://lore.kernel.org/r/20240125120039.3228103-1-hsiangkao@linux.alibaba.com
This commit is contained in:
Gao Xiang 2024-01-25 20:00:39 +08:00
parent 97cf5d53b4
commit cc4b2dd95f
1 changed files with 38 additions and 36 deletions

View File

@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
unsigned int i;
if (i_blocksize(fe->inode) != PAGE_SIZE)
return;
if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
if (i_blocksize(fe->inode) != PAGE_SIZE ||
fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
for (i = 0; i < pclusterpages; ++i) {
struct page *page, *newpage;
void *t; /* mark pages just found for debugging */
/* the compressed page was loaded before */
/* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
page = find_get_page(mc, pcl->obj.index + i);
if (page) {
t = (void *)((unsigned long)page | 1);
newpage = NULL;
@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
t = (void *)((unsigned long)newpage | 1);
}
if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
spin_lock(&pcl->obj.lockref.lock);
if (!pcl->compressed_bvecs[i].page) {
pcl->compressed_bvecs[i].page = t;
spin_unlock(&pcl->obj.lockref.lock);
continue;
}
spin_unlock(&pcl->obj.lockref.lock);
if (page)
put_page(page);
@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
return 0;
}
static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec)
{
struct z_erofs_pcluster *const pcl = fe->pcl;
while (fe->icur > 0) {
if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
NULL, bvec->page)) {
pcl->compressed_bvecs[fe->icur] = *bvec;
return true;
}
}
return false;
}
/* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive)
{
struct z_erofs_pcluster *pcl = fe->pcl;
int ret;
if (exclusive) {
/* give priority for inplaceio to use file pages first */
if (z_erofs_try_inplace_io(fe, bvec))
spin_lock(&pcl->obj.lockref.lock);
while (fe->icur > 0) {
if (pcl->compressed_bvecs[--fe->icur].page)
continue;
pcl->compressed_bvecs[fe->icur] = *bvec;
spin_unlock(&pcl->obj.lockref.lock);
return 0;
}
spin_unlock(&pcl->obj.lockref.lock);
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
!fe->candidate_bvpage)
@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
{
gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
struct z_erofs_bvec zbv;
struct address_space *mapping;
struct page *page, *oldpage;
struct page *page;
int justfound, bs = i_blocksize(f->inode);
/* Except for inplace pages, the entire page can be used for I/Os */
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
oldpage = READ_ONCE(zbv->page);
if (!oldpage)
spin_lock(&pcl->obj.lockref.lock);
zbv = pcl->compressed_bvecs[nr];
page = zbv.page;
justfound = (unsigned long)page & 1UL;
page = (struct page *)((unsigned long)page & ~1UL);
pcl->compressed_bvecs[nr].page = page;
spin_unlock(&pcl->obj.lockref.lock);
if (!page)
goto out_allocpage;
justfound = (unsigned long)oldpage & 1UL;
page = (struct page *)((unsigned long)oldpage & ~1UL);
bvec->bv_page = page;
DBG_BUGON(z_erofs_is_shortlived_page(page));
/*
* Handle preallocated cached pages. We tried to allocate such pages
@ -1448,7 +1447,6 @@ repeat:
*/
if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
set_page_private(page, 0);
WRITE_ONCE(zbv->page, page);
tocache = true;
goto out_tocache;
}
@ -1459,9 +1457,9 @@ repeat:
* therefore it is impossible for `mapping` to be NULL.
*/
if (mapping && mapping != mc) {
if (zbv->offset < 0)
bvec->bv_offset = round_up(-zbv->offset, bs);
bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
if (zbv.offset < 0)
bvec->bv_offset = round_up(-zbv.offset, bs);
bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
return;
}
@ -1471,7 +1469,6 @@ repeat:
/* the cached page is still in managed cache */
if (page->mapping == mc) {
WRITE_ONCE(zbv->page, page);
/*
* The cached page is still available but without a valid
* `->private` pcluster hint. Let's reconnect them.
@ -1503,11 +1500,15 @@ repeat:
put_page(page);
out_allocpage:
page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
spin_lock(&pcl->obj.lockref.lock);
if (pcl->compressed_bvecs[nr].page) {
erofs_pagepool_add(&f->pagepool, page);
spin_unlock(&pcl->obj.lockref.lock);
cond_resched();
goto repeat;
}
pcl->compressed_bvecs[nr].page = page;
spin_unlock(&pcl->obj.lockref.lock);
bvec->bv_page = page;
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
@ -1685,6 +1686,7 @@ submit_bio_retry:
if (cur + bvec.bv_len > end)
bvec.bv_len = end - cur;
DBG_BUGON(bvec.bv_len < sb->s_blocksize);
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset))
goto submit_bio_retry;