bcachefs fixes for 6.9-rc6

- fix a few more deadlocks in recovery
 - fix u32/u64 issues in mi_btree_bitmap
 - btree key cache shrinker now actually frees, with more instrumentation
   coming so we can verify that it's working correctly more easily in the
   future
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKnAFLkS8Qha+jvQrE6szbY3KbnYFAmYmveYACgkQE6szbY3K
 bnY+kA//dtdKfPliuQlfYhUvIZSvgEy+KxjDaDmeJFVMKFYHiip/JJt7V/YU7jWW
 DmWGtPqo1hoGZnic7h9fstbBCRgUdIYGBInqAWPzL3wmFYe5FPE02KN5fuZ+A+Mp
 sn0QFML4oA0uxD7TRXfCeNx3NRwXonztletVskXCuLa0T8iTOTdOuAH0MCGow3OC
 mlfRZyK05f6Q0UOIzvntfBl8Tkr+yk5g0GFz54U7s+qs/zJsYiYfruXuq6AxLTy2
 xVMDj3Hc6W0vggVpv68HInluubl/b7rVSy+w59GG0D3iQ9/fBdqitFLdw49ReXzi
 J//ctZLb3n+IM4lA7t5ev0lY7bvI2FwFNkrL4qW41E4un5eQ3ghbyOmMoz87svyg
 4JW/CPGP7uKNVmfRuHn1qhgJ9/vIXkObJVl9GKZF2BylaZwjMM5YrL5MZwrKFQYy
 9BMgemgvHFK+wRi74q/OUu3PyH045AoJdIKI66ypFexhGi5YeNqtRHLUKdfSrJR+
 eEkAUaHcgLLfxyk+fIRvcSK+Q9j3BibvsSkU3vLSnl2B+xdvfJqb+I/yvBt/SZQW
 P09JceDQABRBMu9beVbVqMED+PniSZVfG2eU2jBcZ+jhbGQgmiWfMNJVKS4/0uwz
 PRS33P2mViVZ6PJokWyecbgGtVxKrK2ruPdcu6/W05Qi0Vv58+k=
 =lCNd
 -----END PGP SIGNATURE-----

Merge tag 'bcachefs-2024-04-22' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "Nothing too crazy in this one, and it looks like (fingers crossed) the
  recovery and repair issues are settling down - although there's going
  to be a long tail there, as we've still yet to really ramp up on error
  injection or syzbot.

   - fix a few more deadlocks in recovery

   - fix u32/u64 issues in mi_btree_bitmap

   - btree key cache shrinker now actually frees, with more
     instrumentation coming so we can verify that it's working
     correctly more easily in the future"

* tag 'bcachefs-2024-04-22' of https://evilpiepirate.org/git/bcachefs:
  bcachefs: If we run merges at a lower watermark, they must be nonblocking
  bcachefs: Fix inode early destruction path
  bcachefs: Fix deadlock in journal write path
  bcachefs: Tweak btree key cache shrinker so it actually frees
  bcachefs: bkey_cached.btree_trans_barrier_seq needs to be a ulong
  bcachefs: Fix missing call to bch2_fs_allocator_background_exit()
  bcachefs: Check for journal entries overruning end of sb clean section
  bcachefs: Fix bio alloc in check_extent_checksum()
  bcachefs: fix leak in bch2_gc_write_reflink_key
  bcachefs: KEY_TYPE_error is allowed for reflink
  bcachefs: Fix bch2_dev_btree_bitmap_marked_sectors() shift
  bcachefs: make sure to release last journal pin in replay
  bcachefs: node scan: ignore multiple nodes with same seq if interior
  bcachefs: Fix format specifier in validate_bset_keys()
  bcachefs: Fix null ptr deref in twf from BCH_IOCTL_FSCK_OFFLINE
This commit is contained in:
Linus Torvalds 2024-04-22 13:53:50 -07:00
commit a2c63a3f3d
19 changed files with 105 additions and 52 deletions

View File

@ -470,7 +470,7 @@ found:
goto err;
}
bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL);
bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL);
bio->bi_iter.bi_sector = p.ptr.offset;
bch2_bio_map(bio, data_buf, bytes);
ret = submit_bio_wait(bio);

View File

@ -1504,7 +1504,8 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_stripe)) \
x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \
BIT_ULL(KEY_TYPE_reflink_v)| \
BIT_ULL(KEY_TYPE_indirect_inline_data)) \
BIT_ULL(KEY_TYPE_indirect_inline_data)| \
BIT_ULL(KEY_TYPE_error)) \
x(subvolumes, 8, 0, \
BIT_ULL(KEY_TYPE_subvolume)) \
x(snapshots, 9, 0, \

View File

@ -1587,7 +1587,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
goto out;
if (!r->refcount)
new->k.type = KEY_TYPE_deleted;
@ -1595,6 +1595,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
ret = bch2_trans_update(trans, iter, new, 0);
}
out:
fsck_err:
printbuf_exit(&buf);
return ret;

View File

@ -888,7 +888,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_bad_u64s,
"bad k->u64s %u (min %u max %lu)", k->u64s,
"bad k->u64s %u (min %u max %zu)", k->u64s,
bkeyp_key_u64s(&b->format, k),
U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
goto drop_this_key;

View File

@ -842,8 +842,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
scanned += bc->nr_freed_nonpcpu;
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@ -857,11 +855,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->nr_freed_nonpcpu--;
}
if (scanned >= nr)
goto out;
scanned += bc->nr_freed_pcpu;
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@ -875,9 +868,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->nr_freed_pcpu--;
}
if (scanned >= nr)
goto out;
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
if (bc->shrink_iter >= tbl->size)
@ -893,12 +883,12 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
ck = container_of(pos, struct bkey_cached, hash);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
goto next;
if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
} else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
else if (bkey_cached_lock_for_evict(ck)) {
goto next;
} else if (bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(bc, ck);
bkey_cached_free(bc, ck);
}
@ -916,7 +906,6 @@ next:
} while (scanned < nr && bc->shrink_iter != start);
rcu_read_unlock();
out:
memalloc_nofs_restore(flags);
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
mutex_unlock(&bc->lock);

View File

@ -302,6 +302,8 @@ again:
start->max_key = bpos_predecessor(n->min_key);
start->range_updated = true;
} else if (n->level) {
n->overwritten = true;
} else {
struct printbuf buf = PRINTBUF;

View File

@ -321,9 +321,9 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
unsigned long btree_trans_barrier_seq;
u16 u64s;
bool valid;
u32 btree_trans_barrier_seq;
struct bkey_cached_key key;
struct rhash_head hash;

View File

@ -1960,7 +1960,11 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
return 0;
flags &= ~BCH_WATERMARK_MASK;
if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
flags &= ~BCH_WATERMARK_MASK;
flags |= BCH_WATERMARK_btree;
flags |= BCH_TRANS_COMMIT_journal_reclaim;
}
b = trans->paths[path].l[level].b;

View File

@ -232,13 +232,15 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
/* We need request_key() to be called before we punt to kthread: */
opt_set(thr->opts, nostart, true);
bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
if (!IS_ERR(thr->c) &&
thr->c->opts.errors == BCH_ON_ERROR_panic)
thr->c->opts.errors = BCH_ON_ERROR_ro;
ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
ret = __bch2_run_thread_with_stdio(&thr->thr);
out:
darray_for_each(devs, i)
kfree(*i);

View File

@ -188,7 +188,8 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
BUG_ON(!old);
if (unlikely(old != inode)) {
discard_new_inode(&inode->v);
__destroy_inode(&inode->v);
kmem_cache_free(bch2_inode_cache, inode);
inode = old;
} else {
mutex_lock(&c->vfs_inodes_lock);
@ -225,8 +226,10 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
if (unlikely(!inode)) {
int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
if (ret && inode)
discard_new_inode(&inode->v);
if (ret && inode) {
__destroy_inode(&inode->v);
kmem_cache_free(bch2_inode_cache, inode);
}
if (ret)
return ERR_PTR(ret);
}

View File

@ -1723,7 +1723,7 @@ static void journal_write_endio(struct bio *bio)
percpu_ref_put(&ca->io_ref);
}
static CLOSURE_CALLBACK(do_journal_write)
static CLOSURE_CALLBACK(journal_write_submit)
{
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
@ -1768,6 +1768,44 @@ static CLOSURE_CALLBACK(do_journal_write)
continue_at(cl, journal_write_done, j->wq);
}
static CLOSURE_CALLBACK(journal_write_preflush)
{
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
spin_lock(&j->lock);
closure_wait(&j->async_wait, cl);
spin_unlock(&j->lock);
continue_at(cl, journal_write_preflush, j->wq);
return;
}
if (w->separate_flush) {
for_each_rw_member(c, ca) {
percpu_ref_get(&ca->io_ref);
struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev,
REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
continue_at(cl, journal_write_submit, j->wq);
} else {
/*
* no need to punt to another work item if we're not waiting on
* preflushes
*/
journal_write_submit(&cl->work);
}
}
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
@ -2033,23 +2071,9 @@ CLOSURE_CALLBACK(bch2_journal_write)
goto err;
if (!JSET_NO_FLUSH(w->data))
closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
for_each_rw_member(c, ca) {
percpu_ref_get(&ca->io_ref);
struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev,
REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
}
continue_at(cl, do_journal_write, j->wq);
continue_at(cl, journal_write_preflush, j->wq);
else
continue_at(cl, journal_write_submit, j->wq);
return;
no_io:
continue_at(cl, journal_write_done, j->wq);

View File

@ -249,7 +249,10 @@ int bch2_journal_replay(struct bch_fs *c)
struct journal_key *k = *kp;
replay_now_at(j, k->journal_seq);
if (k->journal_seq)
replay_now_at(j, k->journal_seq);
else
replay_now_at(j, j->replay_journal_seq_end);
ret = commit_do(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|

View File

@ -29,6 +29,14 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle
for (entry = clean->start;
entry < (struct jset_entry *) vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
if (vstruct_end(entry) > vstruct_end(&clean->field)) {
bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu",
le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s),
(u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field));
bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun);
return -BCH_ERR_fsck_repair_unimplemented;
}
ret = bch2_journal_entry_validate(c, NULL, entry,
le16_to_cpu(c->disk_sb.sb->version),
BCH_SB_BIG_ENDIAN(c->disk_sb.sb),

View File

@ -271,7 +271,8 @@
x(btree_root_unreadable_and_scan_found_nothing, 263) \
x(snapshot_node_missing, 264) \
x(dup_backpointer_to_bad_csum_extent, 265) \
x(btree_bitmap_not_marked, 266)
x(btree_bitmap_not_marked, 266) \
x(sb_clean_entry_overrun, 267)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,

View File

@ -463,8 +463,8 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns
m->btree_bitmap_shift += resize;
}
for (unsigned bit = sectors >> m->btree_bitmap_shift;
bit << m->btree_bitmap_shift < end;
for (unsigned bit = start >> m->btree_bitmap_shift;
(u64) bit << m->btree_bitmap_shift < end;
bit++)
bitmap |= BIT_ULL(bit);

View File

@ -235,11 +235,11 @@ static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64
{
u64 end = start + sectors;
if (end > 64 << ca->mi.btree_bitmap_shift)
if (end > 64ULL << ca->mi.btree_bitmap_shift)
return false;
for (unsigned bit = sectors >> ca->mi.btree_bitmap_shift;
bit << ca->mi.btree_bitmap_shift < end;
for (unsigned bit = start >> ca->mi.btree_bitmap_shift;
(u64) bit << ca->mi.btree_bitmap_shift < end;
bit++)
if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
return false;

View File

@ -544,6 +544,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
bch2_fs_allocator_background_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);

View File

@ -294,16 +294,27 @@ static int thread_with_stdio_fn(void *arg)
return 0;
}
int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
const struct thread_with_stdio_ops *ops)
void bch2_thread_with_stdio_init(struct thread_with_stdio *thr,
const struct thread_with_stdio_ops *ops)
{
stdio_buf_init(&thr->stdio.input);
stdio_buf_init(&thr->stdio.output);
thr->ops = ops;
}
int __bch2_run_thread_with_stdio(struct thread_with_stdio *thr)
{
return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
}
int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
const struct thread_with_stdio_ops *ops)
{
bch2_thread_with_stdio_init(thr, ops);
return __bch2_run_thread_with_stdio(thr);
}
int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
const struct thread_with_stdio_ops *ops)
{

View File

@ -63,6 +63,9 @@ struct thread_with_stdio {
const struct thread_with_stdio_ops *ops;
};
void bch2_thread_with_stdio_init(struct thread_with_stdio *,
const struct thread_with_stdio_ops *);
int __bch2_run_thread_with_stdio(struct thread_with_stdio *);
int bch2_run_thread_with_stdio(struct thread_with_stdio *,
const struct thread_with_stdio_ops *);
int bch2_run_thread_with_stdout(struct thread_with_stdio *,