diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index f6d86eb4b976..66ca0bbee639 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -17,6 +17,7 @@ bcachefs-y := \ btree_journal_iter.o \ btree_key_cache.o \ btree_locking.o \ + btree_node_scan.o \ btree_trans_commit.o \ btree_update.o \ btree_update_interior.o \ @@ -37,6 +38,7 @@ bcachefs-y := \ error.o \ extents.o \ extent_update.o \ + eytzinger.o \ fs.o \ fs-common.o \ fs-ioctl.o \ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 893e38f9db80..4ff56fa4d539 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1713,34 +1713,37 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { - a->v.gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); - goto write; - } - - if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - bch2_trans_inconsistent(trans, - "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" - "%s", - a->v.journal_seq, - c->journal.flushed_seq_ondisk, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (a->v.dirty_sectors) { + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "attempting to discard bucket with dirty data\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = -EIO; - } goto out; } if (a->v.data_type != BCH_DATA_need_discard) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - bch2_trans_inconsistent(trans, - "bucket incorrectly set in need_discard btree\n" - "%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - ret = -EIO; + if (data_type_is_empty(a->v.data_type) && + BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { + a->v.gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + goto write; } + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "bucket incorrectly set in need_discard btree\n" + "%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = -EIO; + goto out; + } + + if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { + if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, + trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", + a->v.journal_seq, + c->journal.flushed_seq_ondisk, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = -EIO; goto out; } @@ -1835,6 +1838,7 @@ static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpo if (ret) goto err; + BUG_ON(a->v.dirty_sectors); SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); a->v.data_type = alloc_data_type(a->v, a->v.data_type); @@ -1942,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, goto out; BUG_ON(a->v.data_type != BCH_DATA_cached); + BUG_ON(a->v.dirty_sectors); if (!a->v.cached_sectors) bch_err(c, "invalidating empty bucket, confused"); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 214b15c84d1f..a1fc30adf912 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { switch (watermark) { - case BCH_WATERMARK_reclaim: + case BCH_WATERMARK_interior_updates: return 0; + case BCH_WATERMARK_reclaim: + return OPEN_BUCKETS_COUNT / 6; case BCH_WATERMARK_btree: case BCH_WATERMARK_btree_copygc: return OPEN_BUCKETS_COUNT / 4; diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index b91b7a461056..c2226e947c41 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -22,7 +22,8 @@ struct bucket_alloc_state { x(copygc) \ x(btree) \ x(btree_copygc) \ - x(reclaim) + x(reclaim) \ + x(interior_updates) enum bch_watermark { #define x(name) BCH_WATERMARK_##name, diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 762c8ddfc5e7..114328acde72 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -8,6 +8,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" +#include "checksum.h" #include "error.h" #include @@ -418,6 +419,84 @@ struct extents_to_bp_state { struct bkey_buf last_flushed; }; +static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, + struct bkey_s_c extent, unsigned dev) +{ + struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bch2_bkey_drop_device(bkey_i_to_s(n), dev); + return bch2_btree_insert_trans(trans, btree, n, 0); +} + +static int check_extent_checksum(struct btree_trans *trans, + enum btree_id btree, struct bkey_s_c extent, + enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct printbuf buf = PRINTBUF; + void *data_buf = NULL; + struct bio *bio = NULL; + size_t bytes; + int ret = 0; + + if (bkey_is_btree_ptr(extent.k)) + return false; + + bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) + if (p.ptr.dev == dev) + goto found; + BUG(); +found: + if (!p.crc.csum_type) + return false; + + bytes = p.crc.compressed_size << 9; + + struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + if (!bch2_dev_get_ioref(ca, READ)) + return false; + + data_buf = kvmalloc(bytes, GFP_KERNEL); + if (!data_buf) { + ret = -ENOMEM; + goto err; + } + + bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL); + bio->bi_iter.bi_sector = p.ptr.offset; + bch2_bio_map(bio, data_buf, bytes); + ret = submit_bio_wait(bio); + if (ret) + goto err; + + prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); + prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree)); + bch2_bkey_val_to_text(&buf, c, extent); + prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); + bch2_bkey_val_to_text(&buf, c, extent2); + + struct nonce nonce = extent_nonce(extent.k->version, p.crc); + struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); + if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), + c, dup_backpointer_to_bad_csum_extent, + "%s", buf.buf)) + ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; +fsck_err: +err: + if (bio) + bio_put(bio); + kvfree(data_buf); + percpu_ref_put(&ca->io_ref); + printbuf_exit(&buf); + return ret; +} + static int check_bp_exists(struct btree_trans *trans, struct extents_to_bp_state *s, struct bpos bucket, @@ -425,7 +504,8 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = { NULL }; + struct btree_iter bp_iter = {}; + struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; struct bkey_buf tmp; @@ -433,13 +513,19 @@ static int check_bp_exists(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); + if (!bch2_dev_bucket_exists(c, bucket)) { + prt_str(&buf, "extent for nonexistent device:bucket "); + bch2_bpos_to_text(&buf, bucket); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, orig_k); + bch_err(c, "%s", buf.buf); + return -BCH_ERR_fsck_repair_unimplemented; + } + if (bpos_lt(bucket, s->bucket_start) || bpos_gt(bucket, s->bucket_end)) return 0; - if (!bch2_dev_bucket_exists(c, bucket)) - goto missing; - bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bucket_pos_to_bp(c, bucket, bp.bucket_offset), 0); @@ -465,21 +551,94 @@ static int check_bp_exists(struct btree_trans *trans, ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } - goto missing; + + goto check_existing_bp; } out: err: fsck_err: + bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); bch2_bkey_buf_exit(&tmp, c); printbuf_exit(&buf); return ret; +check_existing_bp: + /* Do we have a backpointer for a different extent? */ + if (bp_k.k->type != KEY_TYPE_backpointer) + goto missing; + + struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v; + + struct bkey_s_c other_extent = + bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0); + ret = bkey_err(other_extent); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + ret = 0; + if (ret) + goto err; + + if (!other_extent.k) + goto missing; + + if (bch2_extents_match(orig_k, other_extent)) { + printbuf_reset(&buf); + prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n "); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, other_extent); + bch_err(c, "%s", buf.buf); + + if (other_extent.k->size <= orig_k.k->size) { + ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode); + if (ret) + goto err; + goto out; + } else { + ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode); + if (ret) + goto err; + goto missing; + } + } + + ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); + if (ret < 0) + goto err; + if (ret) { + ret = 0; + goto missing; + } + + ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode); + if (ret < 0) + goto err; + if (ret) { + ret = 0; + goto out; + } + + printbuf_reset(&buf); + prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, other_extent); + bch_err(c, "%s", buf.buf); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; missing: + printbuf_reset(&buf); prt_printf(&buf, "missing backpointer for btree=%s l=%u ", bch2_btree_id_str(bp.btree_id), bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\nbp pos "); - bch2_bpos_to_text(&buf, bp_iter.pos); + prt_printf(&buf, "\n got: "); + bch2_bkey_val_to_text(&buf, c, bp_k); + + struct bkey_i_backpointer n_bp_k; + bkey_backpointer_init(&n_bp_k.k_i); + n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + n_bp_k.v = bp; + prt_printf(&buf, "\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 963162a627cd..a31a5f706929 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -456,6 +456,7 @@ enum bch_time_stats { #include "alloc_types.h" #include "btree_types.h" +#include "btree_node_scan_types.h" #include "btree_write_buffer_types.h" #include "buckets_types.h" #include "buckets_waiting_for_journal_types.h" @@ -614,6 +615,7 @@ struct bch_dev { */ #define BCH_FS_FLAGS() \ + x(new_fs) \ x(started) \ x(may_go_rw) \ x(rw) \ @@ -796,6 +798,7 @@ struct bch_fs { u64 features; u64 compat; unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; + u64 btrees_lost_data; } sb; @@ -1103,6 +1106,8 @@ struct bch_fs { struct journal_keys journal_keys; struct list_head journal_iters; + struct find_btree_nodes found_btree_nodes; + u64 last_bucket_seq_cleanup; u64 counters_on_mount[BCH_COUNTER_NR]; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index bff8750ac0d7..63102992d955 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -818,6 +818,7 @@ struct bch_sb_field_ext { struct bch_sb_field field; __le64 recovery_passes_required[2]; __le64 errors_silent[8]; + __le64 btrees_lost_data; }; struct bch_sb_field_downgrade_entry { diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e5d2c6daa663..6280da1244b5 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -13,6 +13,7 @@ #include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" +#include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_gc.h" @@ -41,6 +42,7 @@ #define DROP_THIS_NODE 10 #define DROP_PREV_NODE 11 +#define DID_FILL_FROM_SCAN 12 static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) { @@ -129,6 +131,17 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) struct bkey_i_btree_ptr_v2 *new; int ret; + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_min); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) return -BCH_ERR_ENOMEM_gc_repair_key; @@ -154,6 +167,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) struct bkey_i_btree_ptr_v2 *new; int ret; + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_max); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); if (ret) return ret; @@ -185,127 +209,138 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) return 0; } -static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, - struct btree *prev, struct btree *cur) +static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b, + struct btree *prev, struct btree *cur, + struct bpos *pulled_from_scan) { struct bpos expected_start = !prev ? b->data->min_key : bpos_successor(prev->key.k.p); - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; int ret = 0; - if (!prev) { - prt_printf(&buf1, "start of node: "); - bch2_bpos_to_text(&buf1, b->data->min_key); - } else { - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); + + if (bpos_eq(expected_start, cur->data->min_key)) + return 0; + + prt_printf(&buf, " at btree %s level %u:\n parent: ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + if (prev) { + prt_printf(&buf, "\n prev: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key)); } - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); + prt_str(&buf, "\n next: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key)); - if (prev && - bpos_gt(expected_start, cur->data->min_key) && - BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { - /* cur overwrites prev: */ + if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */ + if (b->c.level == 1 && + bpos_lt(*pulled_from_scan, cur->data->min_key)) { + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, + expected_start, + bpos_predecessor(cur->data->min_key)); + if (ret) + goto err; - if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, - cur->data->min_key), c, - btree_node_topology_overwritten_by_next_node, - "btree node overwritten by next node at btree %s level %u:\n" - " node %s\n" - " next %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = DROP_PREV_NODE; - goto out; + *pulled_from_scan = cur->data->min_key; + ret = DID_FILL_FROM_SCAN; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) + ret = set_node_min(c, cur, expected_start); } - - if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, - bpos_predecessor(cur->data->min_key)), c, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " node %s\n" - " next %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) - ret = set_node_max(c, prev, - bpos_predecessor(cur->data->min_key)); - } else { - /* prev overwrites cur: */ - - if (mustfix_fsck_err_on(bpos_ge(expected_start, - cur->data->max_key), c, - btree_node_topology_overwritten_by_prev_node, - "btree node overwritten by prev node at btree %s level %u:\n" - " prev %s\n" - " node %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = DROP_THIS_NODE; - goto out; + } else { /* overlap */ + if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */ + if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */ + if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node, + "btree node overwritten by next node%s", buf.buf)) + ret = DROP_PREV_NODE; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + "btree node with incorrect max_key%s", buf.buf)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); + } + } else { + if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */ + if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node, + "btree node overwritten by prev node%s", buf.buf)) + ret = DROP_THIS_NODE; + } else { + if (mustfix_fsck_err(c, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) + ret = set_node_min(c, cur, expected_start); + } } - - if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, - btree_node_topology_bad_min_key, - "btree node with incorrect min_key at btree %s level %u:\n" - " prev %s\n" - " node %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) - ret = set_node_min(c, cur, expected_start); } -out: +err: fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); + printbuf_exit(&buf); return ret; } static int btree_repair_node_end(struct bch_fs *c, struct btree *b, - struct btree *child) + struct btree *child, struct bpos *pulled_from_scan) { - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + struct printbuf buf = PRINTBUF; int ret = 0; - bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); - bch2_bpos_to_text(&buf2, b->key.k.p); + if (bpos_eq(child->key.k.p, b->key.k.p)) + return 0; - if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, - btree_node_topology_bad_max_key, - "btree node with incorrect max_key at btree %s level %u:\n" - " %s\n" - " expected %s", - bch2_btree_id_str(b->c.btree_id), b->c.level, - buf1.buf, buf2.buf)) { - ret = set_node_max(c, child, b->key.k.p); - if (ret) - goto err; + prt_printf(&buf, "at btree %s level %u:\n parent: ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + prt_str(&buf, "\n child: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); + + if (mustfix_fsck_err(c, btree_node_topology_bad_max_key, + "btree node with incorrect max_key%s", buf.buf)) { + if (b->c.level == 1 && + bpos_lt(*pulled_from_scan, b->key.k.p)) { + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, + bpos_successor(child->key.k.p), b->key.k.p); + if (ret) + goto err; + + *pulled_from_scan = b->key.k.p; + ret = DID_FILL_FROM_SCAN; + } else { + ret = set_node_max(c, child, b->key.k.p); + } } err: fsck_err: - printbuf_exit(&buf2); - printbuf_exit(&buf1); + printbuf_exit(&buf); return ret; } -static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) +static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b, + struct bpos *pulled_from_scan) { struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf prev_k, cur_k; struct btree *prev = NULL, *cur = NULL; - bool have_child, dropped_children = false; + bool have_child, new_pass = false; struct printbuf buf = PRINTBUF; int ret = 0; if (!b->c.level) return 0; -again: - prev = NULL; - have_child = dropped_children = false; + bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); +again: + cur = prev = NULL; + have_child = new_pass = false; bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); iter.prefetch = true; @@ -332,9 +367,10 @@ again: b->c.level - 1, buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); - ret = bch2_journal_key_delete(c, b->c.btree_id, - b->c.level, cur_k.k->k.p); cur = NULL; + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: + bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); if (ret) break; continue; @@ -344,7 +380,23 @@ again: if (ret) break; - ret = btree_repair_node_boundaries(c, b, prev, cur); + if (bch2_btree_node_is_stale(c, cur)) { + bch_info(c, "btree node %s older than nodes found by scanning", buf.buf); + six_unlock_read(&cur->c.lock); + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + cur = NULL; + if (ret) + break; + continue; + } + + ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan); + if (ret == DID_FILL_FROM_SCAN) { + new_pass = true; + ret = 0; + } if (ret == DROP_THIS_NODE) { six_unlock_read(&cur->c.lock); @@ -370,8 +422,6 @@ again: break; bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); goto again; } else if (ret) break; @@ -383,7 +433,11 @@ again: if (!ret && !IS_ERR_OR_NULL(prev)) { BUG_ON(cur); - ret = btree_repair_node_end(c, b, prev); + ret = btree_repair_node_end(c, b, prev, pulled_from_scan); + if (ret == DID_FILL_FROM_SCAN) { + new_pass = true; + ret = 0; + } } if (!IS_ERR_OR_NULL(prev)) @@ -397,6 +451,10 @@ again: goto err; bch2_btree_and_journal_iter_exit(&iter); + + if (new_pass) + goto again; + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); iter.prefetch = true; @@ -413,7 +471,7 @@ again: if (ret) goto err; - ret = bch2_btree_repair_topology_recurse(trans, cur); + ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan); six_unlock_read(&cur->c.lock); cur = NULL; @@ -421,7 +479,7 @@ again: bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); - dropped_children = true; + new_pass = true; } if (ret) @@ -448,12 +506,14 @@ fsck_err: six_unlock_read(&cur->c.lock); bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&prev_k, c); - bch2_bkey_buf_exit(&cur_k, c); - if (!ret && dropped_children) + if (!ret && new_pass) goto again; + BUG_ON(!ret && bch2_btree_node_check_topology(trans, b)); + + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); printbuf_exit(&buf); return ret; } @@ -461,32 +521,63 @@ fsck_err: int bch2_check_topology(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct btree *b; - unsigned i; + struct bpos pulled_from_scan = POS_MIN; int ret = 0; - for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { struct btree_root *r = bch2_btree_id_root(c, i); + bool reconstructed_root = false; - if (!r->alive) - continue; + if (r->error) { + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + break; +reconstruct_root: + bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i)); - b = r->b; - if (btree_node_fake(b)) - continue; + r->alive = false; + r->error = 0; + + if (!bch2_btree_has_scanned_nodes(c, i)) { + mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing, + "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); + bch2_btree_root_alloc_fake(c, i, 0); + } else { + bch2_btree_root_alloc_fake(c, i, 1); + ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX); + if (ret) + break; + } + + bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + reconstructed_root = true; + } + + struct btree *b = r->b; btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - ret = bch2_btree_repair_topology_recurse(trans, b); + ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan); six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { - bch_err(c, "empty btree root - repair unimplemented"); - ret = -BCH_ERR_fsck_repair_unimplemented; + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + r->b = NULL; + + if (!reconstructed_root) + goto reconstruct_root; + + bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); + bch2_btree_root_alloc_fake(c, i, 0); + r->alive = false; + ret = 0; } } - +fsck_err: bch2_trans_put(trans); - return ret; } @@ -931,9 +1022,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans, b = bch2_btree_id_root(c, btree_id)->b; - if (btree_node_fake(b)) - return 0; - six_lock_read(&b->c.lock, NULL, NULL); printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9c71e6fb9c41..d7de82ac3893 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1264,10 +1264,12 @@ out: return retry_read; fsck_err: if (ret == -BCH_ERR_btree_node_read_err_want_retry || - ret == -BCH_ERR_btree_node_read_err_must_retry) + ret == -BCH_ERR_btree_node_read_err_must_retry) { retry_read = 1; - else + } else { set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); + } goto out; } @@ -1328,6 +1330,7 @@ start: if (!can_retry) { set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); break; } } @@ -1527,9 +1530,10 @@ fsck_err: ret = -1; } - if (ret) + if (ret) { set_btree_node_read_error(b); - else if (*saw_error) + bch2_btree_lost_data(c, b->c.btree_id); + } else if (*saw_error) bch2_btree_node_rewrite_async(c, b); for (i = 0; i < ra->nr; i++) { @@ -1665,6 +1669,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bch2_fatal_error(c); set_btree_node_read_error(b); + bch2_btree_lost_data(c, b->c.btree_id); clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); printbuf_exit(&buf); @@ -1861,7 +1866,7 @@ static void btree_node_write_work(struct work_struct *work) } else { ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, - BCH_WATERMARK_reclaim| + BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw, diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 1f588264575d..5cbcbfe85235 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -567,3 +567,22 @@ int bch2_journal_keys_sort(struct bch_fs *c) bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); return 0; } + +void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, + unsigned level_min, unsigned level_max, + struct bpos start, struct bpos end) +{ + struct journal_keys *keys = &c->journal_keys; + size_t dst = 0; + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) + if (!(i->btree_id == btree && + i->level >= level_min && + i->level <= level_max && + bpos_ge(i->k->k.p, start) && + bpos_le(i->k->k.p, end))) + keys->data[dst++] = *i; + keys->nr = keys->gap = dst; +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 09cd53091a05..af25046ebcaa 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -66,4 +66,8 @@ void bch2_journal_entries_free(struct bch_fs *); int bch2_journal_keys_sort(struct bch_fs *); +void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, + unsigned, unsigned, + struct bpos, struct bpos); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c new file mode 100644 index 000000000000..3f33be7e5e5c --- /dev/null +++ b/fs/bcachefs/btree_node_scan.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_journal_iter.h" +#include "btree_node_scan.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "error.h" +#include "journal_io.h" +#include "recovery_passes.h" + +#include +#include + +struct find_btree_nodes_worker { + struct closure *cl; + struct find_btree_nodes *f; + struct bch_dev *ca; +}; + +static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) +{ + prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie); + bch2_bpos_to_text(out, n->min_key); + prt_str(out, "-"); + bch2_bpos_to_text(out, n->max_key); + + if (n->range_updated) + prt_str(out, " range updated"); + if (n->overwritten) + prt_str(out, " overwritten"); + + for (unsigned i = 0; i < n->nr_ptrs; i++) { + prt_char(out, ' '); + bch2_extent_ptr_to_text(out, c, n->ptrs + i); + } +} + +static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) +{ + printbuf_indent_add(out, 2); + darray_for_each(nodes, i) { + found_btree_node_to_text(out, c, i); + prt_newline(out); + } + printbuf_indent_sub(out, 2); +} + +static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) +{ + struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); + + set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); + bp->k.p = f->max_key; + bp->v.seq = cpu_to_le64(f->cookie); + bp->v.sectors_written = 0; + bp->v.flags = 0; + bp->v.min_key = f->min_key; + SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); + memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); +} + +static bool found_btree_node_is_readable(struct btree_trans *trans, + const struct found_btree_node *f) +{ + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k; + + found_btree_node_to_key(&k.k, f); + + struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); + bool ret = !IS_ERR_OR_NULL(b); + if (ret) + six_unlock_read(&b->c.lock); + + /* + * We might update this node's range; if that happens, we need the node + * to be re-read so the read path can trim keys that are no longer in + * this node + */ + if (b != btree_node_root(trans->c, b)) + bch2_btree_node_evict(trans, &k.k); + return ret; +} + +static int found_btree_node_cmp_cookie(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: + cmp_int(l->cookie, r->cookie); +} + +/* + * Given two found btree nodes, if their sequence numbers are equal, take the + * one that's readable: + */ +static int found_btree_node_cmp_time(const struct found_btree_node *l, + const struct found_btree_node *r) +{ + return cmp_int(l->seq, r->seq); +} + +static int found_btree_node_cmp_pos(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->min_key, r->min_key) ?: + -found_btree_node_cmp_time(l, r); +} + +static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, + struct bio *bio, struct btree_node *bn, u64 offset) +{ + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, bn, PAGE_SIZE); + + submit_bio_wait(bio); + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + "IO error in try_read_btree_node() at %llu: %s", + offset, bch2_blk_status_to_str(bio->bi_status))) + return; + + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return; + + rcu_read_lock(); + struct found_btree_node n = { + .btree_id = BTREE_NODE_ID(bn), + .level = BTREE_NODE_LEVEL(bn), + .seq = BTREE_NODE_SEQ(bn), + .cookie = le64_to_cpu(bn->keys.seq), + .min_key = bn->min_key, + .max_key = bn->max_key, + .nr_ptrs = 1, + .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, + .ptrs[0].offset = offset, + .ptrs[0].dev = ca->dev_idx, + .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)), + }; + rcu_read_unlock(); + + if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { + mutex_lock(&f->lock); + if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { + bch_err(c, "try_read_btree_node() can't handle endian conversion"); + f->ret = -EINVAL; + goto unlock; + } + + if (darray_push(&f->nodes, n)) + f->ret = -ENOMEM; +unlock: + mutex_unlock(&f->lock); + } +} + +static int read_btree_nodes_worker(void *p) +{ + struct find_btree_nodes_worker *w = p; + struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); + struct bch_dev *ca = w->ca; + void *buf = (void *) __get_free_page(GFP_KERNEL); + struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); + unsigned long last_print = jiffies; + + if (!buf || !bio) { + bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); + w->f->ret = -ENOMEM; + goto err; + } + + for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) + for (unsigned bucket_offset = 0; + bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; + bucket_offset += btree_sectors(c)) { + if (time_after(jiffies, last_print + HZ * 30)) { + u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; + u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; + + bch_info(ca, "%s: %2u%% done", __func__, + (unsigned) div64_u64(cur_sector * 100, end_sector)); + last_print = jiffies; + } + + try_read_btree_node(w->f, ca, bio, buf, + bucket * ca->mi.bucket_size + bucket_offset); + } +err: + bio_put(bio); + free_page((unsigned long) buf); + percpu_ref_get(&ca->io_ref); + closure_put(w->cl); + kfree(w); + return 0; +} + +static int read_btree_nodes(struct find_btree_nodes *f) +{ + struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); + struct closure cl; + int ret = 0; + + closure_init_stack(&cl); + + for_each_online_member(c, ca) { + struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); + struct task_struct *t; + + if (!w) { + percpu_ref_put(&ca->io_ref); + ret = -ENOMEM; + goto err; + } + + percpu_ref_get(&ca->io_ref); + closure_get(&cl); + w->cl = &cl; + w->f = f; + w->ca = ca; + + t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); + ret = IS_ERR_OR_NULL(t); + if (ret) { + percpu_ref_put(&ca->io_ref); + closure_put(&cl); + f->ret = ret; + bch_err(c, "error starting kthread: %i", ret); + break; + } + } +err: + closure_sync(&cl); + return f->ret ?: ret; +} + +static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) +{ + while (n + 1 < end && + found_btree_node_cmp_pos(n, n + 1) > 0) { + swap(n[0], n[1]); + n++; + } +} + +static int handle_overwrites(struct bch_fs *c, + struct found_btree_node *start, + struct found_btree_node *end) +{ + struct found_btree_node *n; +again: + for (n = start + 1; + n < end && + n->btree_id == start->btree_id && + n->level == start->level && + bpos_lt(n->min_key, start->max_key); + n++) { + int cmp = found_btree_node_cmp_time(start, n); + + if (cmp > 0) { + if (bpos_cmp(start->max_key, n->max_key) >= 0) + n->overwritten = true; + else { + n->range_updated = true; + n->min_key = bpos_successor(start->max_key); + n->range_updated = true; + bubble_up(n, end); + goto again; + } + } else if (cmp < 0) { + BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); + + start->max_key = bpos_predecessor(n->min_key); + start->range_updated = true; + } else { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "overlapping btree nodes with same seq! halting\n "); + found_btree_node_to_text(&buf, c, start); + prt_str(&buf, "\n "); + found_btree_node_to_text(&buf, c, n); + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + return -1; + } + } + + return 0; +} + +int bch2_scan_for_btree_nodes(struct bch_fs *c) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + struct printbuf buf = PRINTBUF; + size_t dst; + int ret = 0; + + if (f->nodes.nr) + return 0; + + mutex_init(&f->lock); + + ret = read_btree_nodes(f); + if (ret) + return ret; + + if (!f->nodes.nr) { + bch_err(c, "%s: no btree nodes found", __func__); + ret = -EINVAL; + goto err; + } + + if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes found:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); + + dst = 0; + darray_for_each(f->nodes, i) { + struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL; + + if (prev && + prev->cookie == i->cookie) { + if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { + bch_err(c, "%s: found too many replicas for btree node", __func__); + ret = -EINVAL; + goto err; + } + prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; + } else { + f->nodes.data[dst++] = *i; + } + } + f->nodes.nr = dst; + + sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); + + if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + dst = 0; + darray_for_each(f->nodes, i) { + if (i->overwritten) + continue; + + ret = handle_overwrites(c, i, &darray_top(f->nodes)); + if (ret) + goto err; + + BUG_ON(i->overwritten); + f->nodes.data[dst++] = *i; + } + f->nodes.nr = dst; + + if (c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + + eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); +err: + printbuf_exit(&buf); + return ret; +} + +static int found_btree_node_range_start_cmp(const void *_l, const void *_r) +{ + const struct found_btree_node *l = _l; + const struct found_btree_node *r = _r; + + return cmp_int(l->btree_id, r->btree_id) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->max_key, r->min_key); +} + +#define for_each_found_btree_node_in_range(_f, _search, _idx) \ + for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ + sizeof((_f)->nodes.data[0]), \ + found_btree_node_range_start_cmp, &search); \ + _idx < (_f)->nodes.nr && \ + (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ + (_f)->nodes.data[_idx].level == _search.level && \ + bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ + _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) + +bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + + struct found_btree_node search = { + .btree_id = b->c.btree_id, + .level = b->c.level, + .min_key = b->data->min_key, + .max_key = b->key.k.p, + }; + + for_each_found_btree_node_in_range(f, search, idx) + if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) + return true; + return false; +} + +bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) +{ + struct found_btree_node search = { + .btree_id = btree, + .level = 0, + .min_key = POS_MIN, + .max_key = SPOS_MAX, + }; + + for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) + return true; + return false; +} + +int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos node_min, struct bpos node_max) +{ + struct find_btree_nodes *f = &c->found_btree_nodes; + + int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + if (ret) + return ret; + + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); + bch2_bpos_to_text(&buf, node_min); + prt_str(&buf, " - "); + bch2_bpos_to_text(&buf, node_max); + + bch_info(c, "%s(): %s", __func__, buf.buf); + printbuf_exit(&buf); + } + + struct found_btree_node search = { + .btree_id = btree, + .level = level, + .min_key = node_min, + .max_key = node_max, + }; + + for_each_found_btree_node_in_range(f, search, idx) { + struct found_btree_node n = f->nodes.data[idx]; + + n.range_updated |= bpos_lt(n.min_key, node_min); + n.min_key = bpos_max(n.min_key, node_min); + + n.range_updated |= bpos_gt(n.max_key, node_max); + n.max_key = bpos_min(n.max_key, node_max); + + struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; + + found_btree_node_to_key(&tmp.k, &n); + + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); + bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); + printbuf_exit(&buf); + + BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL)); + + ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); + if (ret) + return ret; + } + + return 0; +} + +void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) +{ + darray_exit(&f->nodes); +} diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h new file mode 100644 index 000000000000..08687b209787 --- /dev/null +++ b/fs/bcachefs/btree_node_scan.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_NODE_SCAN_H +#define _BCACHEFS_BTREE_NODE_SCAN_H + +int bch2_scan_for_btree_nodes(struct bch_fs *); +bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *); +bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id); +int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos); +void bch2_find_btree_nodes_exit(struct find_btree_nodes *); + +#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */ diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h new file mode 100644 index 000000000000..abb7b27d556a --- /dev/null +++ b/fs/bcachefs/btree_node_scan_types.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H +#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H + +#include "darray.h" + +struct found_btree_node { + bool range_updated:1; + bool overwritten:1; + u8 btree_id; + u8 level; + u32 seq; + u64 cookie; + + struct bpos min_key; + struct bpos max_key; + + unsigned nr_ptrs; + struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; +}; + +typedef DARRAY(struct found_btree_node) found_btree_nodes; + +struct find_btree_nodes { + int ret; + struct mutex lock; + found_btree_nodes nodes; +}; + +#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 96669fede7d3..aa9da4970740 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -887,6 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; switch (ret) { case -BCH_ERR_btree_insert_btree_node_full: @@ -905,7 +906,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, * flag */ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { + watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 983bb27298cc..32397b99752f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -26,6 +26,13 @@ #include +const char * const bch2_btree_update_modes[] = { +#define x(t) #t, + BCH_WATERMARKS() +#undef x + NULL +}; + static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); @@ -303,7 +310,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct open_buckets obs = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim + unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim ? BTREE_NODE_RESERVE : 0; int ret; @@ -687,7 +694,7 @@ static void btree_update_nodes_written(struct btree_update *as) * which may require allocations as well. */ ret = commit_do(trans, &as->disk_res, &journal_seq, - BCH_WATERMARK_reclaim| + BCH_WATERMARK_interior_updates| BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_journal_reclaim, @@ -846,11 +853,11 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); BUG_ON(!btree_node_dirty(b)); BUG_ON(!b->c.level); - as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->mode = BTREE_UPDATE_node; as->b = b; set_btree_node_write_blocked(b); @@ -873,7 +880,7 @@ static void btree_update_reparent(struct btree_update *as, lockdep_assert_held(&c->btree_interior_update_lock); child->b = NULL; - child->mode = BTREE_INTERIOR_UPDATING_AS; + child->mode = BTREE_UPDATE_update; bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, bch2_update_reparent_journal_pin_flush); @@ -884,7 +891,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) struct bkey_i *insert = &b->key; struct bch_fs *c = as->c; - BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode != BTREE_UPDATE_none); BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ARRAY_SIZE(as->journal_entries)); @@ -898,7 +905,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) mutex_lock(&c->btree_interior_update_lock); list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); - as->mode = BTREE_INTERIOR_UPDATING_ROOT; + as->mode = BTREE_UPDATE_root; mutex_unlock(&c->btree_interior_update_lock); } @@ -1076,7 +1083,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * struct bch_fs *c = as->c; u64 start_time = as->start_time; - BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + BUG_ON(as->mode == BTREE_UPDATE_none); if (as->took_gc_lock) up_read(&as->c->gc_lock); @@ -1121,7 +1128,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK; if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark != BCH_WATERMARK_reclaim) + watermark < BCH_WATERMARK_reclaim) journal_flags |= JOURNAL_RES_GET_NONBLOCK; ret = drop_locks_do(trans, @@ -1172,7 +1179,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, as->c = c; as->start_time = start_time; as->ip_started = _RET_IP_; - as->mode = BTREE_INTERIOR_NO_UPDATE; + as->mode = BTREE_UPDATE_none; + as->watermark = watermark; as->took_gc_lock = true; as->btree_id = path->btree_id; as->update_level = update_level; @@ -1217,7 +1225,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, */ if (bch2_err_matches(ret, ENOSPC) && (flags & BCH_TRANS_COMMIT_journal_reclaim) && - watermark != BCH_WATERMARK_reclaim) { + watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; } @@ -2458,7 +2466,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) +static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level) { struct bch_fs *c = trans->c; struct closure cl; @@ -2477,7 +2485,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) set_btree_node_fake(b); set_btree_node_need_rewrite(b); - b->c.level = 0; + b->c.level = level; b->c.btree_id = id; bkey_btree_ptr_init(&b->key); @@ -2504,9 +2512,21 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) return 0; } -void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); + bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level)); +} + +static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) +{ + prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + (void *) as->ip_started, + bch2_btree_id_str(as->btree_id), + bch2_watermarks[as->watermark], + bch2_btree_update_modes[as->mode], + as->nodes_written, + closure_nr_remaining(&as->cl), + as->journal.seq); } void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) @@ -2515,12 +2535,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_interior_update_lock); list_for_each_entry(as, &c->btree_interior_update_list, list) - prt_printf(out, "%ps: mode=%u nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - (void *) as->ip_started, - as->mode, - as->nodes_written, - closure_nr_remaining(&as->cl), - as->journal.seq); + bch2_btree_update_to_text(out, as); mutex_unlock(&c->btree_interior_update_lock); } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7ac3e17b84fd..88dcf5a22a3b 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -12,6 +12,18 @@ int bch2_btree_node_check_topology(struct btree_trans *, struct btree *); +#define BTREE_UPDATE_MODES() \ + x(none) \ + x(node) \ + x(root) \ + x(update) + +enum btree_update_mode { +#define x(n) BTREE_UPDATE_##n, + BTREE_UPDATE_MODES() +#undef x +}; + /* * Tracks an in progress split/rewrite of a btree node and the update to the * parent node: @@ -39,14 +51,8 @@ struct btree_update { struct list_head list; struct list_head unwritten_list; - /* What kind of update are we doing? */ - enum { - BTREE_INTERIOR_NO_UPDATE, - BTREE_INTERIOR_UPDATING_NODE, - BTREE_INTERIOR_UPDATING_ROOT, - BTREE_INTERIOR_UPDATING_AS, - } mode; - + enum btree_update_mode mode; + enum bch_watermark watermark; unsigned nodes_written:1; unsigned took_gc_lock:1; @@ -56,7 +62,7 @@ struct btree_update { struct disk_reservation disk_res; /* - * BTREE_INTERIOR_UPDATING_NODE: + * BTREE_UPDATE_node: * The update that made the new nodes visible was a regular update to an * existing interior node - @b. We can't write out the update to @b * until the new nodes we created are finished writing, so we block @b @@ -165,7 +171,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, unsigned, bool); void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); +void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned); static inline unsigned btree_update_reserve_required(struct bch_fs *c, struct btree *b) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 6387e039f789..00aaf4bb5139 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -226,6 +226,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma fallthrough; case BCH_WATERMARK_btree_copygc: case BCH_WATERMARK_reclaim: + case BCH_WATERMARK_interior_updates: break; } diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index b564404dad71..34731ee0217f 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -580,8 +580,7 @@ int bch2_data_update_init(struct btree_trans *trans, move_ctxt_wait_event(ctxt, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, PTR_BUCKET_POS(c, &p.ptr), 0)) || - (!atomic_read(&ctxt->read_sectors) && - !atomic_read(&ctxt->write_sectors))); + list_empty(&ctxt->ios)); if (!locked) bch2_bucket_nocow_lock(&c->nocow_locks, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index b2432d88cda6..0e3ca99fbd2d 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -978,6 +978,31 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) return bkey_deleted(k.k); } +void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) +{ + struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); + if (ca && ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } +} + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -993,31 +1018,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " "); switch (__extent_entry_type(entry)) { - case BCH_EXTENT_ENTRY_ptr: { - const struct bch_extent_ptr *ptr = entry_to_ptr(entry); - struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; - - if (!ca) { - prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : ""); - } else { - u32 offset; - u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); - - prt_printf(out, "ptr: %u:%llu:%u gen %u", - ptr->dev, b, offset, ptr->gen); - if (ptr->cached) - prt_str(out, " cached"); - if (ptr->unwritten) - prt_str(out, " unwritten"); - if (ca && ptr_stale(ca, ptr)) - prt_printf(out, " stale"); - } + case BCH_EXTENT_ENTRY_ptr: + bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry)); break; - } + case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 3fd0169b98c1..528e817eacbd 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -676,6 +676,7 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c new file mode 100644 index 000000000000..4ce5e957a6e9 --- /dev/null +++ b/fs/bcachefs/eytzinger.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "eytzinger.h" + +/** + * is_aligned - is this pointer & size okay for word-wide copying? + * @base: pointer to data + * @size: size of each element + * @align: required alignment (typically 4 or 8) + * + * Returns true if elements can be copied using word loads and stores. + * The size must be a multiple of the alignment, and the base address must + * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + * + * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" + * to "if ((a | b) & mask)", so we do that by hand. + */ +__attribute_const__ __always_inline +static bool is_aligned(const void *base, size_t size, unsigned char align) +{ + unsigned char lsbits = (unsigned char)size; + + (void)base; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + lsbits |= (unsigned char)(uintptr_t)base; +#endif + return (lsbits & (align - 1)) == 0; +} + +/** + * swap_words_32 - swap two elements in 32-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 4) + * + * Exchange the two objects in memory. This exploits base+index addressing, + * which basically all CPUs have, to minimize loop overhead computations. + * + * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the + * bottom of the loop, even though the zero flag is still valid from the + * subtract (since the intervening mov instructions don't alter the flags). + * Gcc 8.1.0 doesn't have that problem. + */ +static void swap_words_32(void *a, void *b, size_t n) +{ + do { + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + } while (n); +} + +/** + * swap_words_64 - swap two elements in 64-bit chunks + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size (must be a multiple of 8) + * + * Exchange the two objects in memory. This exploits base+index + * addressing, which basically all CPUs have, to minimize loop overhead + * computations. + * + * We'd like to use 64-bit loads if possible. If they're not, emulating + * one requires base+index+4 addressing which x86 has but most other + * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, + * but it's possible to have 64-bit loads without 64-bit pointers (e.g. + * x32 ABI). Are there any cases the kernel needs to worry about? + */ +static void swap_words_64(void *a, void *b, size_t n) +{ + do { +#ifdef CONFIG_64BIT + u64 t = *(u64 *)(a + (n -= 8)); + *(u64 *)(a + n) = *(u64 *)(b + n); + *(u64 *)(b + n) = t; +#else + /* Use two 32-bit transfers to avoid base+index+4 addressing */ + u32 t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; + + t = *(u32 *)(a + (n -= 4)); + *(u32 *)(a + n) = *(u32 *)(b + n); + *(u32 *)(b + n) = t; +#endif + } while (n); +} + +/** + * swap_bytes - swap two elements a byte at a time + * @a: pointer to the first element to swap + * @b: pointer to the second element to swap + * @n: element size + * + * This is the fallback if alignment doesn't allow using larger chunks. + */ +static void swap_bytes(void *a, void *b, size_t n) +{ + do { + char t = ((char *)a)[--n]; + ((char *)a)[n] = ((char *)b)[n]; + ((char *)b)[n] = t; + } while (n); +} + +/* + * The values are arbitrary as long as they can't be confused with + * a pointer, but small integers make for the smallest compare + * instructions. + */ +#define SWAP_WORDS_64 (swap_r_func_t)0 +#define SWAP_WORDS_32 (swap_r_func_t)1 +#define SWAP_BYTES (swap_r_func_t)2 +#define SWAP_WRAPPER (swap_r_func_t)3 + +struct wrapper { + cmp_func_t cmp; + swap_func_t swap; +}; + +/* + * The function pointer is last to make tail calls most efficient if the + * compiler decides not to inline this function. + */ +static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) +{ + if (swap_func == SWAP_WRAPPER) { + ((const struct wrapper *)priv)->swap(a, b, (int)size); + return; + } + + if (swap_func == SWAP_WORDS_64) + swap_words_64(a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32(a, b, size); + else if (swap_func == SWAP_BYTES) + swap_bytes(a, b, size); + else + swap_func(a, b, (int)size, priv); +} + +#define _CMP_WRAPPER ((cmp_r_func_t)0L) + +static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) +{ + if (cmp == _CMP_WRAPPER) + return ((const struct wrapper *)priv)->cmp(a, b); + return cmp(a, b, priv); +} + +static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, const void *priv, + size_t l, size_t r) +{ + return do_cmp(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + cmp_func, priv); +} + +static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, + swap_r_func_t swap_func, const void *priv, + size_t l, size_t r) +{ + do_swap(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size, swap_func, priv); +} + +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + int i, c, r; + + /* called from 'sort' without swap function, let's pick the default */ + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) + swap_func = NULL; + + if (!swap_func) { + if (is_aligned(base, size, 8)) + swap_func = SWAP_WORDS_64; + else if (is_aligned(base, size, 4)) + swap_func = SWAP_WORDS_32; + else + swap_func = SWAP_BYTES; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + + if (c + 1 < n && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + + for (r = 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + + if (c + 1 < i && + eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0) + c++; + + if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0) + break; + + eytzinger0_do_swap(base, n, size, swap_func, priv, r, c); + } + } +} + +void eytzinger0_sort(void *base, size_t n, size_t size, + cmp_func_t cmp_func, + swap_func_t swap_func) +{ + struct wrapper w = { + .cmp = cmp_func, + .swap = swap_func, + }; + + return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); +} diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index b04750dbf870..ee0e2df33322 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -5,23 +5,33 @@ #include #include -#include "util.h" +#ifdef EYTZINGER_DEBUG +#define EYTZINGER_BUG_ON(cond) BUG_ON(cond) +#else +#define EYTZINGER_BUG_ON(cond) +#endif /* * Traversal for trees in eytzinger layout - a full binary tree layed out in an - * array - */ - -/* - * One based indexing version: + * array. * - * With one based indexing each level of the tree starts at a power of two - - * good for cacheline alignment: + * Consider using an eytzinger tree any time you would otherwise be doing binary + * search over an array. Binary search is a worst case scenario for branch + * prediction and prefetching, but in an eytzinger tree every node's children + * are adjacent in memory, thus we can prefetch children before knowing the + * result of the comparison, assuming multiple nodes fit on a cacheline. + * + * Two variants are provided, for one based indexing and zero based indexing. + * + * Zero based indexing is more convenient, but one based indexing has better + * alignment and thus better performance because each new level of the tree + * starts at a power of two, and thus if element 0 was cacheline aligned, each + * new level will be as well. */ static inline unsigned eytzinger1_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + child; } @@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); @@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EBUG_ON(i > size); + EYTZINGER_BUG_ON(i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; @@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, unsigned shift = __fls(size) - b; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); i ^= 1U << b; i <<= 1; @@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, unsigned shift; int s; - EBUG_ON(!i || i > size); + EYTZINGER_BUG_ON(!i || i > size); /* * sign bit trick: @@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) static inline unsigned eytzinger0_child(unsigned i, unsigned child) { - EBUG_ON(child > 1); + EYTZINGER_BUG_ON(child > 1); return (i << 1) + 1 + child; } @@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) -typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); - /* return greatest node <= @search, or -1 if not found */ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, - eytzinger_cmp_fn cmp, const void *search) + cmp_func_t cmp, const void *search) { unsigned i, n = 0; @@ -244,21 +252,24 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, do { i = n; - n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); } while (n < nr); if (n & 1) { /* @i was greater than @search, return previous node: */ - - if (i == eytzinger0_first(nr)) - return -1; - return eytzinger0_prev(i, nr); } else { return i; } } +static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size, + cmp_func_t cmp, const void *search) +{ + ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + return eytzinger0_next(idx, size); +} + #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ void *_base = (base); \ @@ -269,13 +280,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, int _res; \ \ while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size, _size))) \ + (_res = _cmp(_search, _base + _i * _size))) \ _i = eytzinger0_child(_i, _res > 0); \ _i; \ }) -void eytzinger0_sort(void *, size_t, size_t, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); +void eytzinger0_sort_r(void *, size_t, size_t, + cmp_r_func_t, swap_r_func_t, const void *); +void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t); #endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index cbb8b43e419f..8e2010212cc3 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, u32 *snapshot, u64 *inum) { struct bch_subvolume s; - int ret; - - ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); *snapshot = le32_to_cpu(s.snapshot); *inum = le64_to_cpu(s.inode); @@ -170,7 +168,8 @@ err: /* Get lost+found, create if it doesn't exist: */ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, - struct bch_inode_unpacked *lostfound) + struct bch_inode_unpacked *lostfound, + u64 reattaching_inum) { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); @@ -185,19 +184,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, return ret; subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; - u32 subvol_snapshot; - ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol), - &subvol_snapshot, &root_inum.inum); - bch_err_msg(c, ret, "looking up root subvol"); + struct bch_subvolume subvol; + ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), + false, 0, &subvol); + bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", + le32_to_cpu(st.master_subvol), snapshot); if (ret) return ret; + if (!subvol.inode) { + struct btree_iter iter; + struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(subvol); + if (ret) + return ret; + + subvol->v.inode = cpu_to_le64(reattaching_inum); + bch2_trans_iter_exit(trans, &iter); + } + + root_inum.inum = le64_to_cpu(subvol.inode); + struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; u32 root_inode_snapshot = snapshot; ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot); - bch_err_msg(c, ret, "looking up root inode"); + bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", + root_inum.inum, le32_to_cpu(st.master_subvol)); if (ret) return ret; @@ -293,7 +309,7 @@ static int reattach_inode(struct btree_trans *trans, snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); } - ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum); if (ret) return ret; @@ -364,6 +380,112 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume return ret; } +static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum) +{ + struct bch_fs *c = trans->c; + + if (!bch2_snapshot_is_leaf(c, snapshotid)) { + bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); + return -BCH_ERR_fsck_repair_unimplemented; + } + + /* + * If inum isn't set, that means we're being called from check_dirents, + * not check_inodes - the root of this subvolume doesn't exist or we + * would have found it there: + */ + if (!inum) { + struct btree_iter inode_iter = {}; + struct bch_inode_unpacked new_inode; + u64 cpu = raw_smp_processor_id(); + + bch2_inode_init_early(c, &new_inode); + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); + + new_inode.bi_subvol = subvolid; + + int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: + bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, &new_inode); + bch2_trans_iter_exit(trans, &inode_iter); + if (ret) + return ret; + + inum = new_inode.bi_inum; + } + + bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum); + + struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); + int ret = PTR_ERR_OR_ZERO(new_subvol); + if (ret) + return ret; + + bkey_subvolume_init(&new_subvol->k_i); + new_subvol->k.p.offset = subvolid; + new_subvol->v.snapshot = cpu_to_le32(snapshotid); + new_subvol->v.inode = cpu_to_le64(inum); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0); + if (ret) + return ret; + + struct btree_iter iter; + struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, snapshotid), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); + bch_err_msg(c, ret, "getting snapshot %u", snapshotid); + if (ret) + return ret; + + u32 snapshot_tree = le32_to_cpu(s->v.tree); + + s->v.subvol = cpu_to_le32(subvolid); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); + bch2_trans_iter_exit(trans, &iter); + + struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshot_trees, POS(0, snapshot_tree), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(st); + bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree); + if (ret) + return ret; + + if (!st->v.master_subvol) + st->v.master_subvol = cpu_to_le32(subvolid); + + bch2_trans_iter_exit(trans, &iter); + return 0; +} + +static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked new_inode; + + bch2_inode_init_early(c, &new_inode); + bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL); + new_inode.bi_size = size; + new_inode.bi_inum = inum; + + return __bch2_fsck_write_inode(trans, &new_inode, snapshot); +} + +static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum) +{ + struct btree_iter iter = {}; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); + struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); + bch2_trans_iter_exit(trans, &iter); + int ret = bkey_err(k); + if (ret) + return ret; + + return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG); +} + struct snapshots_seen_entry { u32 id; u32 equiv; @@ -1065,6 +1187,11 @@ static int check_inode(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) goto err; + if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { + ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum); + goto do_update; + } + if (fsck_err_on(ret, c, inode_bi_subvol_missing, "inode %llu:%u bi_subvol points to missing subvolume %u", @@ -1082,7 +1209,7 @@ static int check_inode(struct btree_trans *trans, do_update = true; } } - +do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); @@ -1131,8 +1258,8 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal i->count = count2; if (i->count != count2) { - bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); return -BCH_ERR_internal_fsck_err; } @@ -1435,6 +1562,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; if (k.k->type != KEY_TYPE_whiteout) { + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + ret = reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + inode->last_pos.inode--; + ret = -BCH_ERR_transaction_restart_nested; + goto err; + } + if (fsck_err_on(!i, c, extent_in_missing_inode, "extent in missing inode:\n %s", (printbuf_reset(&buf), @@ -1587,8 +1725,8 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ return count2; if (i->count != count2) { - bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", - i->count, count2); + bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); i->count = count2; if (i->inode.bi_nlink == i->count) continue; @@ -1785,6 +1923,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); u32 parent_snapshot; + u32 new_parent_subvol = 0; u64 parent_inum; struct printbuf buf = PRINTBUF; int ret = 0; @@ -1793,6 +1932,27 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret && !bch2_err_matches(ret, ENOENT)) return ret; + if (ret || + (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) { + int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); + if (ret2 && !bch2_err_matches(ret, ENOENT)) + return ret2; + } + + if (ret && + !new_parent_subvol && + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) { + /* + * Couldn't find a subvol for dirent's snapshot - but we lost + * subvols, so we need to reconstruct: + */ + ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0); + if (ret) + return ret; + + parent_snapshot = d.k->p.snapshot; + } + if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, "dirent parent_subvol points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || @@ -1801,10 +1961,10 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", parent_snapshot, (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - u32 new_parent_subvol; - ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); - if (ret) - goto err; + if (!new_parent_subvol) { + bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); + return -BCH_ERR_fsck_repair_unimplemented; + } struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); ret = PTR_ERR_OR_ZERO(new_dirent); @@ -1850,9 +2010,16 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; + goto err; - if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, + if (ret) { + bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); + ret = -BCH_ERR_fsck_repair_unimplemented; + ret = 0; + goto err; + } + + if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol, c, inode_bi_parent_wrong, "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", target_inum, @@ -1860,13 +2027,13 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * subvol_root.bi_parent_subvol = parent_subvol; ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; } ret = check_dirent_target(trans, iter, d, &subvol_root, target_snapshot); if (ret) - return ret; + goto err; out: err: fsck_err: @@ -1883,7 +2050,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct snapshots_seen *s) { struct bch_fs *c = trans->c; - struct bkey_s_c_dirent d; struct inode_walker_entry *i; struct printbuf buf = PRINTBUF; struct bpos equiv; @@ -1922,6 +2088,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); dir->first_this_inode = false; + if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + ret = reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + + dir->last_pos.inode--; + ret = -BCH_ERR_transaction_restart_nested; + goto err; + } + if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, "dirent in nonexisting directory:\n%s", (printbuf_reset(&buf), @@ -1956,7 +2133,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type != KEY_TYPE_dirent) goto out; - d = bkey_s_c_to_dirent(k); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { ret = check_dirent_to_subvol(trans, iter, d); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index b5303874fc35..37a024e034d4 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -95,8 +95,7 @@ out: return ret ?: bch2_blacklist_table_initialize(c); } -static int journal_seq_blacklist_table_cmp(const void *_l, - const void *_r, size_t size) +static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) { const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index db63b3f3b338..4c298e74723d 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -136,20 +136,8 @@ static void mean_and_variance_test_1(struct kunit *test) d, mean, stddev, weighted_mean, weighted_stddev); } -static void mean_and_variance_test_2(struct kunit *test) -{ - s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; - s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 }; - s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 }; - s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; - s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - /* Test behaviour where we switch from one steady state to another: */ -static void mean_and_variance_test_3(struct kunit *test) +static void mean_and_variance_test_2(struct kunit *test) { s64 d[] = { 100, 100, 100, 100, 100 }; s64 mean[] = { 22, 32, 40, 46, 50 }; @@ -161,18 +149,6 @@ static void mean_and_variance_test_3(struct kunit *test) d, mean, stddev, weighted_mean, weighted_stddev); } -static void mean_and_variance_test_4(struct kunit *test) -{ - s64 d[] = { 100, 100, 100, 100, 100 }; - s64 mean[] = { 10, 11, 12, 13, 14 }; - s64 stddev[] = { 9, 13, 15, 17, 19 }; - s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; - s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; - - do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, - d, mean, stddev, weighted_mean, weighted_stddev); -} - static void mean_and_variance_fast_divpow2(struct kunit *test) { s64 i; @@ -230,8 +206,6 @@ static struct kunit_case mean_and_variance_test_cases[] = { KUNIT_CASE(mean_and_variance_weighted_advanced_test), KUNIT_CASE(mean_and_variance_test_1), KUNIT_CASE(mean_and_variance_test_2), - KUNIT_CASE(mean_and_variance_test_3), - KUNIT_CASE(mean_and_variance_test_4), {} }; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 084247c13bd8..1ac4135cca1c 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -368,11 +368,11 @@ enum fsck_err_opts { OPT_STR_NOLIMIT(bch2_recovery_passes), \ BCH2_NO_SB_OPT, 0, \ NULL, "Exit recovery after specified pass") \ - x(keep_journal, u8, \ + x(retain_recovery_info, u8, \ 0, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ - NULL, "Don't free journal entries/keys after startup")\ + NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\ x(read_entire_journal, u8, \ 0, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e8b434009293..b76c16152579 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -4,6 +4,7 @@ #include "alloc_background.h" #include "bkey_buf.h" #include "btree_journal_iter.h" +#include "btree_node_scan.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -32,6 +33,20 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } +void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) +{ + u64 b = BIT_ULL(btree); + + if (!(c->sb.btrees_lost_data & b)) { + bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); + + mutex_lock(&c->sb_lock); + bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } +} + static bool btree_id_is_alloc(enum btree_id id) { switch (id) { @@ -47,7 +62,7 @@ static bool btree_id_is_alloc(enum btree_id id) } /* for -o reconstruct_alloc: */ -static void do_reconstruct_alloc(struct bch_fs *c) +static void bch2_reconstruct_alloc(struct bch_fs *c) { bch2_journal_log_msg(c, "dropping alloc info"); bch_info(c, "dropping and reconstructing all alloc info"); @@ -82,15 +97,17 @@ static void do_reconstruct_alloc(struct bch_fs *c) c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); - struct journal_keys *keys = &c->journal_keys; - size_t src, dst; - move_gap(keys, keys->nr); - - for (src = 0, dst = 0; src < keys->nr; src++) - if (!btree_id_is_alloc(keys->data[src].btree_id)) - keys->data[dst++] = keys->data[src]; - keys->nr = keys->gap = dst; + bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens, + 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); } /* @@ -269,7 +286,7 @@ int bch2_journal_replay(struct bch_fs *c) bch2_trans_put(trans); trans = NULL; - if (!c->opts.keep_journal && + if (!c->opts.retain_recovery_info && c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); @@ -433,10 +450,9 @@ static int journal_replay_early(struct bch_fs *c, static int read_btree_roots(struct bch_fs *c) { - unsigned i; int ret = 0; - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (!r->alive) @@ -445,36 +461,40 @@ static int read_btree_roots(struct bch_fs *c) if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) continue; - if (r->error) { - __fsck_err(c, - btree_id_is_alloc(i) - ? FSCK_CAN_IGNORE : 0, - btree_root_bkey_invalid, - "invalid btree root %s", - bch2_btree_id_str(i)); - if (i == BTREE_ID_alloc) + if (mustfix_fsck_err_on((ret = r->error), + c, btree_root_bkey_invalid, + "invalid btree root %s", + bch2_btree_id_str(i)) || + mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), + c, btree_root_read_error, + "error reading btree root %s l=%u: %s", + bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { + if (btree_id_is_alloc(i)) { + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - } + r->error = 0; + } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { + bch_info(c, "will run btree node scan"); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + } - ret = bch2_btree_root_read(c, i, &r->key, r->level); - if (ret) { - fsck_err(c, - btree_root_read_error, - "error reading btree root %s", - bch2_btree_id_str(i)); - if (btree_id_is_alloc(i)) - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ret = 0; + bch2_btree_lost_data(c, i); } } - for (i = 0; i < BTREE_ID_NR; i++) { + for (unsigned i = 0; i < BTREE_ID_NR; i++) { struct btree_root *r = bch2_btree_id_root(c, i); - if (!r->b) { + if (!r->b && !r->error) { r->alive = false; r->level = 0; - bch2_btree_root_alloc(c, i); + bch2_btree_root_alloc_fake(c, i, 0); } } fsck_err: @@ -651,7 +671,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) { struct genradix_iter iter; struct journal_replay **i; @@ -731,7 +751,7 @@ use_clean: c->journal_replay_seq_end = blacklist_seq - 1; if (c->opts.reconstruct_alloc) - do_reconstruct_alloc(c); + bch2_reconstruct_alloc(c); zero_out_btree_mem_ptr(&c->journal_keys); @@ -838,13 +858,19 @@ use_clean: } if (!test_bit(BCH_FS_error, &c->flags) && - (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) || - !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) { - memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); + !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) { memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); write_sb = true; } + if (c->opts.fsck && + !test_bit(BCH_FS_error, &c->flags) && + c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && + ext->btrees_lost_data) { + ext->btrees_lost_data = 0; + write_sb = true; + } + if (c->opts.fsck && !test_bit(BCH_FS_error, &c->flags) && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { @@ -883,9 +909,10 @@ use_clean: out: bch2_flush_fsck_errs(c); - if (!c->opts.keep_journal && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + if (!c->opts.retain_recovery_info) { bch2_journal_keys_put_initial(c); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); + } kfree(clean); if (!ret && @@ -911,6 +938,7 @@ int bch2_fs_initialize(struct bch_fs *c) int ret; bch_notice(c, "initializing new filesystem"); + set_bit(BCH_FS_new_fs, &c->flags); mutex_lock(&c->sb_lock); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); @@ -929,7 +957,7 @@ int bch2_fs_initialize(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); for (unsigned i = 0; i < BTREE_ID_NR; i++) - bch2_btree_root_alloc(c, i); + bch2_btree_root_alloc_fake(c, i, 0); for_each_member_device(c, ca) bch2_dev_usage_init(ca); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 3962fd87b50d..4bf818de1f2f 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H +void bch2_btree_lost_data(struct bch_fs *, enum btree_id); + int bch2_journal_replay(struct bch_fs *); int bch2_fs_recovery(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 0089d9456b10..cb501460d615 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -4,6 +4,7 @@ #include "alloc_background.h" #include "backpointers.h" #include "btree_gc.h" +#include "btree_node_scan.h" #include "ec.h" #include "fsck.h" #include "inode.h" @@ -59,18 +60,23 @@ static struct recovery_pass_fn recovery_pass_fns[] = { #undef x }; -u64 bch2_recovery_passes_to_stable(u64 v) -{ - static const u8 map[] = { +static const u8 passes_to_stable_map[] = { #define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, BCH_RECOVERY_PASSES() #undef x - }; +}; +static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) +{ + return passes_to_stable_map[pass]; +} + +u64 bch2_recovery_passes_to_stable(u64 v) +{ u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) + for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); + ret |= BIT_ULL(passes_to_stable_map[i]); return ret; } @@ -116,13 +122,13 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, enum bch_recovery_pass pass) { - __le64 s = cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(pass))); + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - if (!(ext->recovery_passes_required[0] & s)) { - ext->recovery_passes_required[0] |= s; + if (!test_bit_le64(s, ext->recovery_passes_required)) { + __set_bit_le64(s, ext->recovery_passes_required); bch2_write_super(c); } mutex_unlock(&c->sb_lock); @@ -130,6 +136,21 @@ int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, return bch2_run_explicit_recovery_pass(c, pass); } +static void bch2_clear_recovery_pass_required(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (test_bit_le64(s, ext->recovery_passes_required)) { + __clear_bit_le64(s, ext->recovery_passes_required); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); +} + u64 bch2_fsck_recovery_passes(void) { u64 ret = 0; @@ -218,6 +239,9 @@ int bch2_run_recovery_passes(struct bch_fs *c) c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + if (!test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, c->curr_recovery_pass); + c->curr_recovery_pass++; } diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index f30521285706..773aea9a0080 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -13,6 +13,7 @@ * must never change: */ #define BCH_RECOVERY_PASSES() \ + x(scan_for_btree_nodes, 37, 0) \ x(check_topology, 4, 0) \ x(alloc_read, 0, PASS_ALWAYS) \ x(stripes_read, 1, PASS_ALWAYS) \ @@ -31,6 +32,7 @@ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ x(bucket_gens_init, 17, 0) \ + x(reconstruct_snapshots, 38, 0) \ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index cc2672c12031..678b9c20e251 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -6,12 +6,15 @@ #include "replicas.h" #include "super-io.h" +#include + static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ -static int bch2_memcmp(const void *l, const void *r, size_t size) +static int bch2_memcmp(const void *l, const void *r, const void *priv) { + size_t size = (size_t) priv; return memcmp(l, r, size); } @@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { - eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL); + eytzinger0_sort_r(r->entries, r->nr, r->entry_size, + bch2_memcmp, NULL, (void *)(size_t)r->entry_size); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, @@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, verify_replicas_entry(search); -#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) +#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) idx = eytzinger0_find(r->entries, r->nr, r->entry_size, entry_cmp, search); #undef entry_cmp @@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, { unsigned i; - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - bch2_memcmp, NULL); + sort_r(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + bch2_memcmp, NULL, + (void *)(size_t)cpu_r->entry_size); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 73e9634df8ff..d7d609131030 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -267,7 +267,10 @@ x(subvol_unreachable, 259) \ x(btree_node_bkey_bad_u64s, 260) \ x(btree_node_topology_empty_interior_node, 261) \ - x(btree_ptr_v2_min_key_bad, 262) + x(btree_ptr_v2_min_key_bad, 262) \ + x(btree_root_unreadable_and_scan_found_nothing, 263) \ + x(snapshot_node_missing, 264) \ + x(dup_backpointer_to_bad_csum_extent, 265) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 4577ee7939a2..0e806f04f3d7 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -8,6 +8,7 @@ #include "errcode.h" #include "error.h" #include "fs.h" +#include "recovery_passes.h" #include "snapshot.h" #include @@ -574,6 +575,13 @@ static int check_snapshot_tree(struct btree_trans *trans, u32 subvol_id; ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + bch_err_fn(c, ret); + + if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ + ret = 0; + goto err; + } + if (ret) goto err; @@ -731,7 +739,6 @@ static int check_snapshot(struct btree_trans *trans, u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); u32 real_depth; struct printbuf buf = PRINTBUF; - bool should_have_subvol; u32 i, id; int ret = 0; @@ -777,7 +784,7 @@ static int check_snapshot(struct btree_trans *trans, } } - should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && !BCH_SNAPSHOT_DELETED(&s); if (should_have_subvol) { @@ -879,6 +886,154 @@ int bch2_check_snapshots(struct bch_fs *c) return ret; } +static int check_snapshot_exists(struct btree_trans *trans, u32 id) +{ + struct bch_fs *c = trans->c; + + if (bch2_snapshot_equiv(c, id)) + return 0; + + u32 tree_id; + int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id); + if (ret) + return ret; + + struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot)); + ret = PTR_ERR_OR_ZERO(snapshot); + if (ret) + return ret; + + bkey_snapshot_init(&snapshot->k_i); + snapshot->k.p = POS(0, id); + snapshot->v.tree = cpu_to_le32(tree_id); + snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + + return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: + bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i)); +} + +/* Figure out which snapshot nodes belong in the same tree: */ +struct snapshot_tree_reconstruct { + enum btree_id btree; + struct bpos cur_pos; + snapshot_id_list cur_ids; + DARRAY(snapshot_id_list) trees; +}; + +static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r) +{ + darray_for_each(r->trees, i) + darray_exit(i); + darray_exit(&r->trees); + darray_exit(&r->cur_ids); +} + +static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos) +{ + return r->btree == BTREE_ID_inodes + ? r->cur_pos.offset == pos.offset + : r->cur_pos.inode == pos.inode; +} + +static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) +{ + darray_for_each(*l, i) + if (snapshot_list_has_id(r, *i)) + return true; + return false; +} + +static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) +{ + bool first = true; + darray_for_each(*s, i) { + if (!first) + prt_char(out, ' '); + first = false; + prt_printf(out, "%u", *i); + } +} + +static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r) +{ + if (r->cur_ids.nr) { + darray_for_each(r->trees, i) + if (snapshot_id_lists_have_common(i, &r->cur_ids)) { + int ret = snapshot_list_merge(c, i, &r->cur_ids); + if (ret) + return ret; + goto out; + } + darray_push(&r->trees, r->cur_ids); + darray_init(&r->cur_ids); + } +out: + r->cur_ids.nr = 0; + return 0; +} + +static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos) +{ + if (!same_snapshot(r, pos)) + snapshot_tree_reconstruct_next(c, r); + r->cur_pos = pos; + return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot); +} + +int bch2_reconstruct_snapshots(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; + struct snapshot_tree_reconstruct r = {}; + int ret = 0; + + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { + if (btree_type_has_snapshots(btree)) { + r.btree = btree; + + ret = for_each_btree_key(trans, iter, btree, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({ + get_snapshot_trees(c, &r, k.k->p); + })); + if (ret) + goto err; + + snapshot_tree_reconstruct_next(c, &r); + } + } + + darray_for_each(r.trees, t) { + printbuf_reset(&buf); + snapshot_id_list_to_text(&buf, t); + + darray_for_each(*t, id) { + if (fsck_err_on(!bch2_snapshot_equiv(c, *id), + c, snapshot_node_missing, + "snapshot node %u from tree %s missing", *id, buf.buf)) { + if (t->nr > 1) { + bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; + } + + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_snapshot_exists(trans, *id)); + if (ret) + goto err; + } + } + } +fsck_err: +err: + bch2_trans_put(trans); + snapshot_tree_reconstruct_exit(&r); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ @@ -1689,6 +1844,20 @@ int bch2_snapshots_read(struct bch_fs *c) POS_MIN, 0, k, (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); bch_err_fn(c, ret); + + /* + * It's important that we check if we need to reconstruct snapshots + * before going RW, so we mark that pass as required in the superblock - + * otherwise, we could end up deleting keys with missing snapshot nodes + * instead + */ + BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && + test_bit(BCH_FS_may_go_rw, &c->flags)); + + if (bch2_err_matches(ret, EIO) || + (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))) + ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots); + return ret; } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 331f20fd8d03..b7d2fed37c4f 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -209,15 +209,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) { - int ret; - BUG_ON(snapshot_list_has_id(s, id)); - ret = darray_push(s, id); + int ret = darray_push(s, id); if (ret) bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); return ret; } +static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + int ret = snapshot_list_has_id(s, id) + ? 0 + : darray_push(s, id); + if (ret) + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); + return ret; +} + +static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src) +{ + darray_for_each(*src, i) { + int ret = snapshot_list_add_nodup(c, dst, *i); + if (ret) + return ret; + } + + return 0; +} + int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s); int bch2_snapshot_get_subvol(struct btree_trans *, u32, @@ -229,6 +248,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); +int bch2_reconstruct_snapshots(struct bch_fs *); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); void bch2_delete_dead_snapshots_work(struct work_struct *); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e15f8b1f30c2..e0aa3655b63b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -527,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c) memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); - if (ext) + if (ext) { le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, sizeof(c->sb.errors_silent) * 8); + c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); + } for_each_member_device(c, ca) { struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); @@ -1162,6 +1164,11 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb, kfree(errors_silent); } + + prt_printf(out, "Btrees with missing data:"); + prt_tab(out); + prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data)); + prt_newline(out); } static const struct bch_sb_field_ops bch_sb_field_ops_ext = { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index bc026a77eb99..ed63018f21be 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -15,6 +15,7 @@ #include "btree_gc.h" #include "btree_journal_iter.h" #include "btree_key_cache.h" +#include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_write_buffer.h" @@ -536,6 +537,7 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); @@ -560,6 +562,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); bch2_journal_keys_put_initial(c); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 216fadf16928..92c6ad75e702 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -707,149 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } -static int alignment_ok(const void *base, size_t align) -{ - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || - ((unsigned long)base & (align - 1)) == 0; -} - -static void u32_swap(void *a, void *b, size_t size) -{ - u32 t = *(u32 *)a; - *(u32 *)a = *(u32 *)b; - *(u32 *)b = t; -} - -static void u64_swap(void *a, void *b, size_t size) -{ - u64 t = *(u64 *)a; - *(u64 *)a = *(u64 *)b; - *(u64 *)b = t; -} - -static void generic_swap(void *a, void *b, size_t size) -{ - char t; - - do { - t = *(char *)a; - *(char *)a++ = *(char *)b; - *(char *)b++ = t; - } while (--size > 0); -} - -static inline int do_cmp(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - size_t l, size_t r) -{ - return cmp_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -static inline void do_swap(void *base, size_t n, size_t size, - void (*swap_func)(void *, void *, size_t), - size_t l, size_t r) -{ - swap_func(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, - size); -} - -void eytzinger0_sort(void *base, size_t n, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)) -{ - int i, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; - - if (c + 1 < n && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } - - /* sort */ - for (i = n - 1; i > 0; --i) { - do_swap(base, n, size, swap_func, 0, i); - - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; - - if (c + 1 < i && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); - } - } -} - -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t size)) -{ - /* pre-scale counters for performance */ - int i = (num/2 - 1) * size, n = num * size, c, r; - - if (!swap_func) { - if (size == 4 && alignment_ok(base, 4)) - swap_func = u32_swap; - else if (size == 8 && alignment_ok(base, 8)) - swap_func = u64_swap; - else - swap_func = generic_swap; - } - - /* heapify */ - for ( ; i >= 0; i -= size) { - for (r = i; r * 2 + size < n; r = c) { - c = r * 2 + size; - if (c < n - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } - - /* sort */ - for (i = n - size; i > 0; i -= size) { - swap_func(base, base + i, size); - for (r = 0; r * 2 + size < i; r = c) { - c = r * 2 + size; - if (c < i - size && - cmp_func(base + c, base + c + size, size) < 0) - c += size; - if (cmp_func(base + r, base + c, size) >= 0) - break; - swap_func(base + r, base + c, size); - } - } -} - #if 0 void eytzinger1_test(void) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 175aee3074c7..b7e7c29278fc 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -631,10 +631,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes) memset(s + bytes, c, rem); } -void sort_cmp_size(void *base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *, size_t), - void (*swap_func)(void *, void *, size_t)); - /* just the memmove, doesn't update @_nr */ #define __array_insert_item(_array, _nr, _pos) \ memmove(&(_array)[(_pos) + 1], \ @@ -797,4 +793,14 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr) addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); } +static inline void __clear_bit_le64(size_t bit, __le64 *addr) +{ + addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64)); +} + +static inline bool test_bit_le64(size_t bit, __le64 *addr) +{ + return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; +} + #endif /* _BCACHEFS_UTIL_H */