diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 0cd868c8248b..ac2dddd90c31 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -23,7 +23,7 @@ static u64 last_unwritten_seq(struct journal *j)
 
 	lockdep_assert_held(&j->lock);
 
-	return journal_cur_seq(j) - s.prev_buf_unwritten;
+	return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
 }
 
 static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
@@ -51,7 +51,7 @@ journal_seq_to_buf(struct journal *j, u64 seq)
 		j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
 
 	if (journal_seq_unwritten(j, seq)) {
-		buf = j->buf + (seq & 1);
+		buf = j->buf + (seq & JOURNAL_BUF_MASK);
 		EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
 	}
 	return buf;
@@ -108,15 +108,8 @@ void bch2_journal_halt(struct journal *j)
 
 /* journal entry close/open: */
 
-void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
+void __bch2_journal_buf_put(struct journal *j)
 {
-	if (!need_write_just_set &&
-	    test_bit(JOURNAL_NEED_WRITE, &j->flags))
-		bch2_time_stats_update(j->delay_time,
-				       j->need_write_time);
-
-	clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
 	closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
 }
 
@@ -129,7 +122,6 @@ static bool __journal_entry_close(struct journal *j)
 	struct journal_buf *buf = journal_cur_buf(j);
 	union journal_res_state old, new;
 	u64 v = atomic64_read(&j->reservations.counter);
-	bool set_need_write = false;
 	unsigned sectors;
 
 	lockdep_assert_held(&j->lock);
@@ -148,15 +140,13 @@ static bool __journal_entry_close(struct journal *j)
 		if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
 			set_bit(JOURNAL_NEED_WRITE, &j->flags);
 			j->need_write_time = local_clock();
-			set_need_write = true;
 		}
 
-		if (new.prev_buf_unwritten)
-			return false;
-
 		new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
 		new.idx++;
-		new.prev_buf_unwritten = 1;
+
+		if (new.idx == new.unwritten_idx)
+			return false;
 
 		BUG_ON(journal_state_count(new, new.idx));
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
@@ -190,24 +180,44 @@ static bool __journal_entry_close(struct journal *j)
 	 */
 	buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
 
+	__bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
+
 	journal_pin_new_entry(j, 1);
 
 	bch2_journal_buf_init(j);
 
 	cancel_delayed_work(&j->write_work);
+	clear_bit(JOURNAL_NEED_WRITE, &j->flags);
 
 	bch2_journal_space_available(j);
 
-	bch2_journal_buf_put(j, old.idx, set_need_write);
+	bch2_journal_buf_put(j, old.idx);
 	return true;
 }
 
+static bool journal_entry_want_write(struct journal *j)
+{
+	union journal_res_state s = READ_ONCE(j->reservations);
+	bool ret = false;
+
+	/*
+	 * Don't close it yet if we already have a write in flight, but do set
+	 * NEED_WRITE:
+	 */
+	if (s.idx != s.unwritten_idx)
+		set_bit(JOURNAL_NEED_WRITE, &j->flags);
+	else
+		ret = __journal_entry_close(j);
+
+	return ret;
+}
+
 static bool journal_entry_close(struct journal *j)
 {
 	bool ret;
 
 	spin_lock(&j->lock);
-	ret = __journal_entry_close(j);
+	ret = journal_entry_want_write(j);
 	spin_unlock(&j->lock);
 
 	return ret;
@@ -289,8 +299,8 @@ static int journal_entry_open(struct journal *j)
 
 static bool journal_quiesced(struct journal *j)
 {
-	union journal_res_state state = READ_ONCE(j->reservations);
-	bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
+	union journal_res_state s = READ_ONCE(j->reservations);
+	bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
 
 	if (!ret)
 		journal_entry_close(j);
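The hunks above replace the single prev_buf_unwritten flag with ring arithmetic over the (now four) journal buffers: the number of closed-but-unwritten entries is the distance from unwritten_idx up to idx, modulo JOURNAL_BUF_NR, and the journal is quiesced only when the two indexes coincide. Below is a minimal standalone sketch of that arithmetic -- it is not part of the patch, and it uses plain unsigned ints rather than the packed journal_res_state bitfield:

  /*
   * Standalone illustration (not kernel code) of the ring arithmetic:
   * with JOURNAL_BUF_BITS == 2 there are four buffer slots, and the
   * number of closed-but-unwritten entries is the distance from
   * unwritten_idx up to idx, taken modulo JOURNAL_BUF_NR.
   */
  #include <assert.h>
  #include <stdio.h>

  #define JOURNAL_BUF_BITS	2
  #define JOURNAL_BUF_NR	(1U << JOURNAL_BUF_BITS)
  #define JOURNAL_BUF_MASK	(JOURNAL_BUF_NR - 1)

  /* How many entries are closed but not yet written out? */
  static unsigned nr_unwritten(unsigned idx, unsigned unwritten_idx)
  {
  	return (idx - unwritten_idx) & JOURNAL_BUF_MASK;
  }

  int main(void)
  {
  	/* No write in flight: both indexes point at the open entry. */
  	assert(nr_unwritten(1, 1) == 0);

  	/* One closed entry waiting, wrapping around the four-slot ring. */
  	assert(nr_unwritten(0, 3) == 1);

  	/* Three in flight is the maximum: a fourth close must wait. */
  	assert(nr_unwritten(3, 0) == 3);

  	printf("ring arithmetic ok\n");
  	return 0;
  }

The same arithmetic explains the new.idx == new.unwritten_idx bail-out in __journal_entry_close() above: allowing a fourth closed-but-unwritten entry would make a full ring indistinguishable from an empty one.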
@@ -317,17 +327,29 @@ static void journal_write_work(struct work_struct *work)
 u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
 {
 	size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
-	u64 seq = 0;
+	union journal_res_state s;
+	unsigned i;
+	u64 seq;
 
-	if (!test_bit(h, j->buf[0].has_inode) &&
-	    !test_bit(h, j->buf[1].has_inode))
-		return 0;
-
 	spin_lock(&j->lock);
-	if (test_bit(h, journal_cur_buf(j)->has_inode))
-		seq = journal_cur_seq(j);
-	else if (test_bit(h, journal_prev_buf(j)->has_inode))
-		seq = journal_cur_seq(j) - 1;
+	seq = journal_cur_seq(j);
+	s = READ_ONCE(j->reservations);
+	i = s.idx;
+
+	while (1) {
+		if (test_bit(h, j->buf[i].has_inode))
+			goto out;
+
+		if (i == s.unwritten_idx)
+			break;
+
+		i = (i - 1) & JOURNAL_BUF_MASK;
+		seq--;
+	}
+
+	seq = 0;
+out:
 	spin_unlock(&j->lock);
 
 	return seq;
@@ -574,7 +596,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 		BUG();
 
 	if (seq == journal_cur_seq(j))
-		__journal_entry_close(j);
+		journal_entry_want_write(j);
 out:
 	spin_unlock(&j->lock);
 	return ret;
@@ -863,15 +885,18 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
 static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
 {
 	union journal_res_state state;
-	struct journal_buf *w;
-	bool ret;
+	bool ret = false;
+	unsigned i;
 
 	spin_lock(&j->lock);
 	state = READ_ONCE(j->reservations);
-	w = j->buf + !state.idx;
+	i = state.idx;
 
-	ret = state.prev_buf_unwritten &&
-		bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx);
+	while (i != state.unwritten_idx) {
+		i = (i - 1) & JOURNAL_BUF_MASK;
+		if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
+			ret = true;
+	}
 	spin_unlock(&j->lock);
 
 	return ret;
@@ -957,7 +982,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
 
 	journal_pin_new_entry(j, 1);
 
-	j->reservations.idx = journal_cur_seq(j);
+	j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
 
 	bch2_journal_buf_init(j);
 
@@ -1015,8 +1040,10 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 void bch2_fs_journal_exit(struct journal *j)
 {
-	kvpfree(j->buf[1].data, j->buf[1].buf_size);
-	kvpfree(j->buf[0].data, j->buf[0].buf_size);
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(j->buf); i++)
+		kvpfree(j->buf[i].data, j->buf[i].buf_size);
 	free_fifo(&j->pin);
 }
 
@@ -1024,6 +1051,7 @@ int bch2_fs_journal_init(struct journal *j)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	static struct lock_class_key res_key;
+	unsigned i;
 	int ret = 0;
 
 	pr_verbose_init(c->opts, "");
@@ -1038,8 +1066,6 @@ int bch2_fs_journal_init(struct journal *j)
 
 	lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
 
-	j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN;
-	j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN;
 	j->write_delay_ms = 1000;
 	j->reclaim_delay_ms = 100;
 
@@ -1051,13 +1077,20 @@ int bch2_fs_journal_init(struct journal *j)
 		((union journal_res_state)
 		 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
 
-	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
-	    !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
-	    !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
+	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
+	for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+		j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+		j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+		if (!j->buf[i].data) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
 	j->pin.front = j->pin.back = 1;
 out:
 	pr_verbose_init(c->opts, "ret %i", ret);
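bch2_inode_journal_seq() and bch2_journal_writing_to_device() above both walk the dirty buffers newest-first, stepping an index backwards with (i - 1) & JOURNAL_BUF_MASK until they reach unwritten_idx. A standalone sketch of that walk, under the same four-slot assumption (the function and variable names here are illustrative, not from the patch):

  #include <stdio.h>

  #define JOURNAL_BUF_BITS	2
  #define JOURNAL_BUF_NR	(1U << JOURNAL_BUF_BITS)
  #define JOURNAL_BUF_MASK	(JOURNAL_BUF_NR - 1)

  /*
   * Visit the currently open entry at idx, then every older entry that has
   * not been written out yet, down to (and including) unwritten_idx --
   * the same loop shape as bch2_inode_journal_seq() above.
   */
  static void walk_dirty_slots(unsigned idx, unsigned unwritten_idx)
  {
  	unsigned i = idx;

  	while (1) {
  		printf("slot %u\n", i);

  		if (i == unwritten_idx)
  			break;

  		i = (i - 1) & JOURNAL_BUF_MASK;
  	}
  }

  int main(void)
  {
  	/* idx == 1, unwritten_idx == 3: visits 1, 0, 3, wrapping around. */
  	walk_dirty_slots(1, 3);
  	return 0;
  }

bch2_journal_writing_to_device() uses the same step but starts one slot below idx, since the currently open entry is not yet being written to any device.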
@@ -1071,7 +1104,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	union journal_res_state s;
 	struct bch_dev *ca;
-	unsigned iter;
+	unsigned i;
 
 	rcu_read_lock();
 	spin_lock(&j->lock);
@@ -1114,16 +1147,16 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 	}
 
 	pr_buf(out,
-	       "current entry refs:\t%u\n"
-	       "prev entry unwritten:\t",
-	       journal_state_count(s, s.idx));
+	       "current entry:\tidx %u refcount %u\n",
+	       s.idx, journal_state_count(s, s.idx));
 
-	if (s.prev_buf_unwritten)
-		pr_buf(out, "yes, ref %u sectors %u\n",
-		       journal_state_count(s, !s.idx),
-		       journal_prev_buf(j)->sectors);
-	else
-		pr_buf(out, "no\n");
+	i = s.idx;
+	while (i != s.unwritten_idx) {
+		i = (i - 1) & JOURNAL_BUF_MASK;
+
+		pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
+		       i, journal_state_count(s, i), j->buf[i].sectors);
+	}
 
 	pr_buf(out,
 	       "need write:\t\t%i\n"
@@ -1131,7 +1164,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 	       test_bit(JOURNAL_NEED_WRITE, &j->flags),
 	       test_bit(JOURNAL_REPLAY_DONE, &j->flags));
 
-	for_each_member_device_rcu(ca, c, iter,
+	for_each_member_device_rcu(ca, c, i,
 				   &c->rw_devs[BCH_DATA_journal]) {
 		struct journal_device *ja = &ca->journal;
 
@@ -1146,7 +1179,7 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 		       "\tdirty_idx_ondisk\t%u (seq %llu)\n"
 		       "\tdirty_idx\t\t%u (seq %llu)\n"
 		       "\tcur_idx\t\t%u (seq %llu)\n",
-		       iter, ja->nr,
+		       i, ja->nr,
 		       bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
 		       ja->sectors_free,
 		       ja->discard_idx,
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 7ad2bb576eb0..1b6175cd6f1b 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -127,11 +127,6 @@ static inline struct journal_buf *journal_cur_buf(struct journal *j)
 	return j->buf + j->reservations.idx;
 }
 
-static inline struct journal_buf *journal_prev_buf(struct journal *j)
-{
-	return j->buf + !j->reservations.idx;
-}
-
 /* Sequence number of oldest dirty journal entry */
 
 static inline u64 journal_last_seq(struct journal *j)
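journal_prev_buf() is deleted above because "the previous buffer" is only meaningful with two slots; with four, callers either address a slot by explicit index (as the debug output loop now does) or want the oldest unwritten entry, which journal_io.c's new journal_last_unwritten_buf() returns. A hedged sketch of that distinction, with a toy struct standing in for struct journal:

  #include <assert.h>

  #define JOURNAL_BUF_BITS	2
  #define JOURNAL_BUF_NR	(1U << JOURNAL_BUF_BITS)

  struct toy_buf { int dummy; };

  /* A toy stand-in for the fields of struct journal used here. */
  struct toy_journal {
  	struct toy_buf	buf[JOURNAL_BUF_NR];
  	unsigned	idx;		/* currently open entry */
  	unsigned	unwritten_idx;	/* oldest entry not yet on disk */
  };

  /* The open entry is still simply buf + idx... */
  static struct toy_buf *cur_buf(struct toy_journal *j)
  {
  	return j->buf + j->idx;
  }

  /*
   * ...but buf + !idx only named "the other buffer" when there were two.
   * What the write path actually wants is the oldest unwritten entry,
   * which is what the patch's journal_last_unwritten_buf() returns.
   */
  static struct toy_buf *last_unwritten_buf(struct toy_journal *j)
  {
  	return j->buf + j->unwritten_idx;
  }

  int main(void)
  {
  	struct toy_journal j = { .idx = 2, .unwritten_idx = 0 };

  	assert(cur_buf(&j) == &j.buf[2]);
  	assert(last_unwritten_buf(&j) == &j.buf[0]);
  	return 0;
  }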
@@ -151,13 +146,21 @@ void bch2_journal_set_has_inum(struct journal *, u64, u64);
 
 static inline int journal_state_count(union journal_res_state s, int idx)
 {
-	return idx == 0 ? s.buf0_count : s.buf1_count;
+	switch (idx) {
+	case 0: return s.buf0_count;
+	case 1: return s.buf1_count;
+	case 2: return s.buf2_count;
+	case 3: return s.buf3_count;
+	}
+	BUG();
 }
 
 static inline void journal_state_inc(union journal_res_state *s)
 {
 	s->buf0_count += s->idx == 0;
 	s->buf1_count += s->idx == 1;
+	s->buf2_count += s->idx == 2;
+	s->buf3_count += s->idx == 3;
 }
 
 static inline void bch2_journal_set_has_inode(struct journal *j,
@@ -257,21 +260,24 @@ static inline bool journal_entry_empty(struct jset *j)
 	return true;
 }
 
-void __bch2_journal_buf_put(struct journal *, bool);
+void __bch2_journal_buf_put(struct journal *);
 
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
-					bool need_write_just_set)
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
 {
 	union journal_res_state s;
 
 	s.v = atomic64_sub_return(((union journal_res_state) {
 				    .buf0_count = idx == 0,
 				    .buf1_count = idx == 1,
+				    .buf2_count = idx == 2,
+				    .buf3_count = idx == 3,
 				    }).v, &j->reservations.counter);
-	if (!journal_state_count(s, idx)) {
-		EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
-		__bch2_journal_buf_put(j, need_write_just_set);
-	}
+
+	EBUG_ON(((s.idx - idx) & 3) >
+		((s.idx - s.unwritten_idx) & 3));
+
+	if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
+		__bch2_journal_buf_put(j);
 }
 
 /*
@@ -291,7 +297,7 @@ static inline void bch2_journal_res_put(struct journal *j,
 				       BCH_JSET_ENTRY_btree_keys,
 				       0, 0, NULL, 0);
 
-	bch2_journal_buf_put(j, res->idx, false);
+	bch2_journal_buf_put(j, res->idx);
 
 	res->ref = 0;
 }
@@ -327,11 +333,18 @@ static inline int journal_res_get_fast(struct journal *j,
 		    !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
 			return 0;
 
-		if (flags & JOURNAL_RES_GET_CHECK)
-			return 1;
-
 		new.cur_entry_offset += res->u64s;
 		journal_state_inc(&new);
+
+		/*
+		 * If the refcount would overflow, we have to wait:
+		 * XXX - tracepoint this:
+		 */
+		if (!journal_state_count(new, new.idx))
+			return 0;
+
+		if (flags & JOURNAL_RES_GET_CHECK)
+			return 1;
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
 				       old.v, new.v)) != old.v);
 
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index fc2fdcc2b627..1aeeb58d3c2a 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -950,16 +950,23 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
 	buf->buf_size = new_size;
 }
 
+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
+{
+	return j->buf + j->reservations.unwritten_idx;
+}
+
 static void journal_write_done(struct closure *cl)
 {
 	struct journal *j = container_of(cl, struct journal, io);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct journal_buf *w = journal_prev_buf(j);
+	struct journal_buf *w = journal_last_unwritten_buf(j);
 	struct bch_devs_list devs = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
 	struct bch_replicas_padded replicas;
+	union journal_res_state old, new;
 	u64 seq = le64_to_cpu(w->data->seq);
 	u64 last_seq = le64_to_cpu(w->data->last_seq);
+	u64 v;
 	int err = 0;
 
 	bch2_time_stats_update(j->write_time, j->write_start_time);
 
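Shrinking the per-buffer refcounts to 10 bits each is what makes room for four counters plus the two 2-bit indexes in one 64-bit word (20 + 2 + 2 + 4*10 = 64), and it is why journal_res_get_fast() above now re-checks the count after the increment: a value that wraps to zero means the slot is saturated and the reservation has to wait. A small sketch of that overflow check; the bitfield below only mirrors the layout from journal_types.h for illustration and is not the kernel type:

  #include <assert.h>

  /*
   * Mirror of the journal_res_state bitfield layout, for illustration only:
   * 20 + 2 + 2 + 4 * 10 = 64 bits.  Field widths copied from the patch;
   * everything else here is made up.
   */
  struct state_sketch {
  	unsigned long long	cur_entry_offset:20,
  				idx:2,
  				unwritten_idx:2,
  				buf0_count:10,
  				buf1_count:10,
  				buf2_count:10,
  				buf3_count:10;
  };

  /* Returns 0 on success, -1 if another ref would overflow the 10-bit field. */
  static int try_get_ref(struct state_sketch *s)
  {
  	struct state_sketch new = *s;

  	new.buf0_count++;		/* pretend idx == 0 */
  	if (!new.buf0_count)		/* 1023 + 1 wraps to 0 in 10 bits */
  		return -1;

  	*s = new;
  	return 0;
  }

  int main(void)
  {
  	struct state_sketch s = { .buf0_count = 1022 };

  	assert(sizeof(s) == 8);		/* one 64-bit word on the ABIs Linux targets */
  	assert(try_get_ref(&s) == 0);	/* 1023: fine */
  	assert(try_get_ref(&s) == -1);	/* would wrap: reservation must wait */
  	return 0;
  }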
@@ -998,9 +1005,14 @@ static void journal_write_done(struct closure *cl)
 	/* also must come before signalling write completion: */
 	closure_debug_destroy(cl);
 
-	BUG_ON(!j->reservations.prev_buf_unwritten);
-	atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
-		     &j->reservations.counter);
+	v = atomic64_read(&j->reservations.counter);
+	do {
+		old.v = new.v = v;
+		BUG_ON(new.idx == new.unwritten_idx);
+
+		new.unwritten_idx++;
+	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
+				       old.v, new.v)) != old.v);
 
 	closure_wake_up(&w->wait);
 	journal_wake(j);
@@ -1008,6 +1020,10 @@ static void journal_write_done(struct closure *cl)
 	if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
 		mod_delayed_work(system_freezable_wq, &j->write_work, 0);
 	spin_unlock(&j->lock);
+
+	if (new.unwritten_idx != new.idx &&
+	    !journal_state_count(new, new.unwritten_idx))
+		closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
 }
 
 static void journal_write_endio(struct bio *bio)
@@ -1018,7 +1034,7 @@ static void journal_write_endio(struct bio *bio)
 	if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s",
 			       bch2_blk_status_to_str(bio->bi_status)) ||
 	    bch2_meta_write_fault("journal")) {
-		struct journal_buf *w = journal_prev_buf(j);
+		struct journal_buf *w = journal_last_unwritten_buf(j);
 		unsigned long flags;
 
 		spin_lock_irqsave(&j->err_lock, flags);
@@ -1035,7 +1051,7 @@ void bch2_journal_write(struct closure *cl)
 	struct journal *j = container_of(cl, struct journal, io);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct bch_dev *ca;
-	struct journal_buf *w = journal_prev_buf(j);
+	struct journal_buf *w = journal_last_unwritten_buf(j);
 	struct jset_entry *start, *end;
 	struct jset *jset;
 	struct bio *bio;
@@ -1046,8 +1062,6 @@ void bch2_journal_write(struct closure *cl)
 
 	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
 
-	bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
-
 	journal_buf_realloc(j, w);
 	jset = w->data;
 
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 4fd2b272e04e..c50352385a47 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -58,6 +58,19 @@ static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
 				       old.v, new.v)) != old.v);
 }
 
+static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
+{
+	unsigned sectors = 0;
+
+	while (!sectors && *idx != j->reservations.idx) {
+		sectors = j->buf[*idx].sectors;
+
+		*idx = (*idx + 1) & JOURNAL_BUF_MASK;
+	}
+
+	return sectors;
+}
+
 static struct journal_space {
 	unsigned	next_entry;
 	unsigned	remaining;
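journal_write_done() above retires a buffer by advancing unwritten_idx with a compare-and-exchange retry loop on the packed reservation word, then starts the next write immediately if another closed entry is already waiting with no outstanding references. The sketch below models just the index-advance step; it substitutes C11 atomic_compare_exchange_weak() for the kernel's atomic64_cmpxchg() and packs only the two index fields, so treat it as an illustration of the loop shape rather than the real state layout:

  #include <assert.h>
  #include <stdatomic.h>
  #include <stdint.h>

  #define JOURNAL_BUF_MASK	3u

  /* Only the two index fields of the reservation word are modelled here. */
  union res_state {
  	uint64_t v;
  	struct {
  		uint64_t	idx:2,
  				unwritten_idx:2;
  	};
  };

  /* Retire the oldest in-flight buffer, as journal_write_done() does. */
  static void write_done_advance(_Atomic uint64_t *counter)
  {
  	union res_state old, new;

  	old.v = atomic_load(counter);
  	do {
  		new.v = old.v;
  		assert(new.idx != new.unwritten_idx);	/* a write was in flight */

  		new.unwritten_idx = (new.unwritten_idx + 1) & JOURNAL_BUF_MASK;
  		/* on failure, old.v is refreshed with the current value */
  	} while (!atomic_compare_exchange_weak(counter, &old.v, new.v));
  }

  int main(void)
  {
  	union res_state s = { .v = 0 };
  	_Atomic uint64_t counter;

  	s.idx = 2;
  	s.unwritten_idx = 0;
  	counter = s.v;

  	write_done_advance(&counter);

  	s.v = atomic_load(&counter);
  	assert(s.idx == 2 && s.unwritten_idx == 1);
  	return 0;
  }

After the real loop succeeds, journal_write_done() also checks whether the next unwritten buffer already has a zero refcount and, if so, calls straight into bch2_journal_write() for it -- that is the new closure_call() at the end of the @@ -1008 hunk above.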
@@ -69,15 +82,14 @@ static struct journal_space {
 	unsigned sectors_next_entry	= UINT_MAX;
 	unsigned sectors_total		= UINT_MAX;
 	unsigned i, nr_devs = 0;
-	unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
-				? journal_prev_buf(j)->sectors
-				: 0;
+	unsigned unwritten_sectors;
 
 	rcu_read_lock();
 	for_each_member_device_rcu(ca, c, i,
 				   &c->rw_devs[BCH_DATA_journal]) {
 		struct journal_device *ja = &ca->journal;
 		unsigned buckets_this_device, sectors_this_device;
+		unsigned idx = j->reservations.unwritten_idx;
 
 		if (!ja->nr)
 			continue;
@@ -89,16 +101,20 @@ static struct journal_space {
 		 * We that we don't allocate the space for a journal entry
 		 * until we write it out - thus, account for it here:
 		 */
-		if (unwritten_sectors >= sectors_this_device) {
-			if (!buckets_this_device)
-				continue;
+		while ((unwritten_sectors = get_unwritten_sectors(j, &idx))) {
+			if (unwritten_sectors >= sectors_this_device) {
+				if (!buckets_this_device) {
+					sectors_this_device = 0;
+					break;
+				}
 
-			buckets_this_device--;
-			sectors_this_device = ca->mi.bucket_size;
+				buckets_this_device--;
+				sectors_this_device = ca->mi.bucket_size;
+			}
+
+			sectors_this_device -= unwritten_sectors;
 		}
 
-		sectors_this_device -= unwritten_sectors;
-
 		if (sectors_this_device < ca->mi.bucket_size &&
 		    buckets_this_device) {
 			buckets_this_device--;
@@ -277,6 +293,14 @@ static void bch2_journal_reclaim_fast(struct journal *j)
 		bch2_journal_space_available(j);
 }
 
+void __bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+	if (atomic_dec_and_test(&pin_list->count))
+		bch2_journal_reclaim_fast(j);
+}
+
 void bch2_journal_pin_put(struct journal *j, u64 seq)
 {
 	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index 3404fef241ea..b0f05839396d 100644
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@@ -39,6 +39,7 @@ journal_seq_pin(struct journal *j, u64 seq)
 	return &j->pin.data[seq & j->pin.mask];
 }
 
+void __bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
 
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 7e328ccc0a8f..ec19f75f8ede 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -11,13 +11,13 @@
 
 struct journal_res;
 
-#define JOURNAL_BUF_BITS	1
+#define JOURNAL_BUF_BITS	2
 #define JOURNAL_BUF_NR		(1U << JOURNAL_BUF_BITS)
 #define JOURNAL_BUF_MASK	(JOURNAL_BUF_NR - 1)
 
 /*
- * We put two of these in struct journal; we used them for writes to the
- * journal that are being staged or in flight.
+ * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
+ * the journal that are being staged or in flight.
  */
 struct journal_buf {
 	struct jset		*data;
@@ -85,10 +85,12 @@ union journal_res_state {
 
 	struct {
 		u64		cur_entry_offset:20,
-				idx:1,
-				prev_buf_unwritten:1,
-				buf0_count:21,
-				buf1_count:21;
+				idx:2,
+				unwritten_idx:2,
+				buf0_count:10,
+				buf1_count:10,
+				buf2_count:10,
+				buf3_count:10;
 	};
 };
 
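The new accounting loop in __journal_space_available() (the @@ -89 hunk above) charges every closed-but-unwritten entry against the device's free space before sizing the next entry, opening a fresh bucket whenever a pending entry does not fit in what remains of the current one. A standalone approximation of that logic; the function name and array-of-sizes interface below are invented for the example, whereas the patch pulls each size from get_unwritten_sectors():

  #include <assert.h>

  /*
   * Approximation of the loop added to __journal_space_available():
   * subtract each pending (unwritten) journal entry from the space left
   * in the current bucket, moving on to a spare bucket when one doesn't fit.
   */
  static unsigned sectors_left(unsigned bucket_size,
  			     unsigned buckets_this_device,
  			     unsigned sectors_this_device,
  			     const unsigned *unwritten, unsigned nr_unwritten)
  {
  	unsigned i;

  	for (i = 0; i < nr_unwritten; i++) {
  		unsigned unwritten_sectors = unwritten[i];

  		if (unwritten_sectors >= sectors_this_device) {
  			if (!buckets_this_device)
  				return 0;	/* no room left for a new entry */

  			buckets_this_device--;
  			sectors_this_device = bucket_size;
  		}

  		sectors_this_device -= unwritten_sectors;
  	}

  	return sectors_this_device;
  }

  int main(void)
  {
  	/* Two pending writes of 300 and 200 sectors, 512-sector buckets. */
  	const unsigned pending[] = { 300, 200 };

  	/* 100 sectors left in the current bucket plus one spare bucket. */
  	assert(sectors_left(512, 1, 100, pending, 2) == 12);

  	/* Same pending writes but no spare bucket: nothing left. */
  	assert(sectors_left(512, 0, 100, pending, 2) == 0);

  	return 0;
  }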
@@ -169,7 +171,7 @@ struct journal {
 	 * Two journal entries -- one is currently open for new entries, the
 	 * other is possibly being written out.
 	 */
-	struct journal_buf buf[2];
+	struct journal_buf buf[JOURNAL_BUF_NR];
 
 	spinlock_t lock;
 
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index d24cef2bf1aa..7ad5b8234747 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1048,13 +1048,13 @@ int bch2_fs_recovery(struct bch_fs *c)
 	if (!c->sb.clean) {
 		ret = bch2_journal_seq_blacklist_add(c,
 						     journal_seq,
-						     journal_seq + 4);
+						     journal_seq + 8);
 		if (ret) {
 			bch_err(c, "error creating new journal seq blacklist entry");
 			goto err;
 		}
 
-		journal_seq += 4;
+		journal_seq += 8;
 
 		/*
 		 * The superblock needs to be written before we do any btree
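With four buffers instead of two, recovery blacklists a window of 8 journal sequence numbers where it used to blacklist 4. The patch hard-codes both constants; the relationship looks like 2 * JOURNAL_BUF_NR, though that is an inference rather than something the patch states. A hypothetical way to keep the two values tied together (all names below are invented, not from bcachefs):

  /*
   * Hypothetical tightening, not in the patch: derive the blacklist gap
   * from JOURNAL_BUF_NR instead of hard-coding 8, so the two stay in sync
   * if the number of buffers changes again.
   */
  #define JOURNAL_BUF_BITS	2
  #define JOURNAL_BUF_NR	(1U << JOURNAL_BUF_BITS)

  #define JOURNAL_BLACKLIST_GAP	(2 * JOURNAL_BUF_NR)

  _Static_assert(JOURNAL_BLACKLIST_GAP == 8,
  	       "must match the constants used in bch2_fs_recovery()");

  int main(void)
  {
  	return 0;
  }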