linux-stable/drivers/md/bcache/bcache_ondisk.h

447 lines
9.8 KiB
C
Raw Normal View History

License cleanup: add SPDX license identifier to uapi header files with no license Many user space API headers are missing licensing information, which makes it hard for compliance tools to determine the correct license. By default are files without license information under the default license of the kernel, which is GPLV2. Marking them GPLV2 would exclude them from being included in non GPLV2 code, which is obviously not intended. The user space API headers fall under the syscall exception which is in the kernels COPYING file: NOTE! This copyright does *not* cover user programs that use kernel services by normal system calls - this is merely considered normal use of the kernel, and does *not* fall under the heading of "derived work". otherwise syscall usage would not be possible. Update the files which contain no license information with an SPDX license identifier. The chosen identifier is 'GPL-2.0 WITH Linux-syscall-note' which is the officially assigned identifier for the Linux syscall exception. SPDX license identifiers are a legally binding shorthand, which can be used instead of the full boiler plate text. This patch is based on work done by Thomas Gleixner and Kate Stewart and Philippe Ombredanne. See the previous patch in this series for the methodology of how this patch was researched. Reviewed-by: Kate Stewart <kstewart@linuxfoundation.org> Reviewed-by: Philippe Ombredanne <pombredanne@nexb.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2017-11-01 14:08:43 +00:00
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _LINUX_BCACHE_H
#define _LINUX_BCACHE_H
/*
* Bcache on disk data structures
*/
#include <linux/types.h>
#define BITMASK(name, type, field, offset, size) \
static inline __u64 name(const type *k) \
{ return (k->field >> offset) & ~(~0ULL << size); } \
\
static inline void SET_##name(type *k, __u64 v) \
{ \
k->field &= ~(~(~0ULL << size) << offset); \
k->field |= (v & ~(~0ULL << size)) << offset; \
}
/* Btree keys - all units are in sectors */
struct bkey {
__u64 high;
__u64 low;
__u64 ptr[];
};
#define KEY_FIELD(name, field, offset, size) \
BITMASK(name, struct bkey, field, offset, size)
#define PTR_FIELD(name, offset, size) \
static inline __u64 name(const struct bkey *k, unsigned int i) \
{ return (k->ptr[i] >> offset) & ~(~0ULL << size); } \
\
static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v) \
{ \
k->ptr[i] &= ~(~(~0ULL << size) << offset); \
k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \
}
#define KEY_SIZE_BITS 16
#define KEY_MAX_U64S 8
KEY_FIELD(KEY_PTRS, high, 60, 3)
KEY_FIELD(__PAD0, high, 58, 2)
KEY_FIELD(KEY_CSUM, high, 56, 2)
KEY_FIELD(__PAD1, high, 55, 1)
KEY_FIELD(KEY_DIRTY, high, 36, 1)
KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS)
KEY_FIELD(KEY_INODE, high, 0, 20)
/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */
static inline __u64 KEY_OFFSET(const struct bkey *k)
{
return k->low;
}
static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v)
{
k->low = v;
}
/*
* The high bit being set is a relic from when we used it to do binary
* searches - it told you where a key started. It's not used anymore,
* and can probably be safely dropped.
*/
#define KEY(inode, offset, size) \
((struct bkey) { \
.high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \
.low = (offset) \
})
#define ZERO_KEY KEY(0, 0, 0)
#define MAX_KEY_INODE (~(~0 << 20))
#define MAX_KEY_OFFSET (~0ULL >> 1)
#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0)
#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k))
#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0)
#define PTR_DEV_BITS 12
PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS)
PTR_FIELD(PTR_OFFSET, 8, 43)
PTR_FIELD(PTR_GEN, 0, 8)
#define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1)
#define MAKE_PTR(gen, offset, dev) \
((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen)
/* Bkey utility code */
static inline unsigned long bkey_u64s(const struct bkey *k)
{
return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k);
}
static inline unsigned long bkey_bytes(const struct bkey *k)
{
return bkey_u64s(k) * sizeof(__u64);
}
#define bkey_copy(_dest, _src) unsafe_memcpy(_dest, _src, bkey_bytes(_src), \
/* bkey is always padded */)
static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src)
{
SET_KEY_INODE(dest, KEY_INODE(src));
SET_KEY_OFFSET(dest, KEY_OFFSET(src));
}
static inline struct bkey *bkey_next(const struct bkey *k)
{
__u64 *d = (void *) k;
return (struct bkey *) (d + bkey_u64s(k));
}
static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys)
{
__u64 *d = (void *) k;
return (struct bkey *) (d + nr_keys);
}
/* Enough for a key with 6 pointers */
#define BKEY_PAD 8
#define BKEY_PADDED(key) \
union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; }
/* Superblock */
/* Version 0: Cache device
* Version 1: Backing device
* Version 2: Seed pointer into btree node checksum
* Version 3: Cache device with new UUID format
* Version 4: Backing device with data offset
*/
#define BCACHE_SB_VERSION_CDEV 0
#define BCACHE_SB_VERSION_BDEV 1
#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
#define BCACHE_SB_VERSION_CDEV_WITH_FEATURES 5
#define BCACHE_SB_VERSION_BDEV_WITH_FEATURES 6
#define BCACHE_SB_MAX_VERSION 6
#define SB_SECTOR 8
#define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT)
#define SB_SIZE 4096
#define SB_LABEL_SIZE 32
#define SB_JOURNAL_BUCKETS 256U
/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */
#define MAX_CACHES_PER_SET 8
#define BDEV_DATA_START_DEFAULT 16 /* sectors */
struct cache_sb_disk {
__le64 csum;
__le64 offset; /* sector where this sb was written */
__le64 version;
__u8 magic[16];
__u8 uuid[16];
union {
__u8 set_uuid[16];
__le64 set_magic;
};
__u8 label[SB_LABEL_SIZE];
__le64 flags;
__le64 seq;
__le64 feature_compat;
__le64 feature_incompat;
__le64 feature_ro_compat;
__le64 pad[5];
union {
struct {
/* Cache devices */
__le64 nbuckets; /* device size */
__le16 block_size; /* sectors */
__le16 bucket_size; /* sectors */
__le16 nr_in_set;
__le16 nr_this_dev;
};
struct {
/* Backing devices */
__le64 data_offset;
/*
* block_size from the cache device section is still used by
* backing devices, so don't add anything here until we fix
* things to not need it for backing devices anymore
*/
};
};
__le32 last_mount; /* time overflow in y2106 */
__le16 first_bucket;
union {
__le16 njournal_buckets;
__le16 keys;
};
__le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */
bcache: introduce BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE for large bucket When large bucket feature was added, BCH_FEATURE_INCOMPAT_LARGE_BUCKET was introduced into the incompat feature set. It used bucket_size_hi (which was added at the tail of struct cache_sb_disk) to extend current 16bit bucket size to 32bit with existing bucket_size in struct cache_sb_disk. This is not a good idea, there are two obvious problems, - Bucket size is always value power of 2, if store log2(bucket size) in existing bucket_size of struct cache_sb_disk, it is unnecessary to add bucket_size_hi. - Macro csum_set() assumes d[SB_JOURNAL_BUCKETS] is the last member in struct cache_sb_disk, bucket_size_hi was added after d[] which makes csum_set calculate an unexpected super block checksum. To fix the above problems, this patch introduces a new incompat feature bit BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE, when this bit is set, it means bucket_size in struct cache_sb_disk stores the order of power-of-2 bucket size value. When user specifies a bucket size larger than 32768 sectors, BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE will be set to incompat feature set, and bucket_size stores log2(bucket size) more than store the real bucket size value. The obsoleted BCH_FEATURE_INCOMPAT_LARGE_BUCKET won't be used anymore, it is renamed to BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET and still only recognized by kernel driver for legacy compatible purpose. The previous bucket_size_hi is renmaed to obso_bucket_size_hi in struct cache_sb_disk and not used in bcache-tools anymore. For cache device created with BCH_FEATURE_INCOMPAT_LARGE_BUCKET feature, bcache-tools and kernel driver still recognize the feature string and display it as "obso_large_bucket". With this change, the unnecessary extra space extend of bcache on-disk super block can be avoided, and csum_set() may generate expected check sum as well. Fixes: ffa470327572 ("bcache: add bucket_size_hi into struct cache_sb_disk for large bucket") Signed-off-by: Coly Li <colyli@suse.de> Cc: stable@vger.kernel.org # 5.9+ Signed-off-by: Jens Axboe <axboe@kernel.dk>
2021-01-04 07:41:21 +00:00
__le16 obso_bucket_size_hi; /* obsoleted */
};
/*
* This is for in-memory bcache super block.
* NOTE: cache_sb is NOT exactly mapping to cache_sb_disk, the member
* size, ordering and even whole struct size may be different
* from cache_sb_disk.
*/
struct cache_sb {
__u64 offset; /* sector where this sb was written */
__u64 version;
__u8 magic[16];
__u8 uuid[16];
union {
__u8 set_uuid[16];
__u64 set_magic;
};
__u8 label[SB_LABEL_SIZE];
__u64 flags;
__u64 seq;
__u64 feature_compat;
__u64 feature_incompat;
__u64 feature_ro_compat;
union {
struct {
/* Cache devices */
__u64 nbuckets; /* device size */
__u16 block_size; /* sectors */
__u16 nr_in_set;
__u16 nr_this_dev;
bcache: add bucket_size_hi into struct cache_sb_disk for large bucket The large bucket feature is to extend bucket_size from 16bit to 32bit. When create cache device on zoned device (e.g. zoned NVMe SSD), making a single bucket cover one or more zones of the zoned device is the simplest way to support zoned device as cache by bcache. But current maximum bucket size is 16MB and a typical zone size of zoned device is 256MB, this is the major motiviation to extend bucket size to a larger bit width. This patch is the basic and first change to support large bucket size, the major changes it makes are, - Add BCH_FEATURE_INCOMPAT_LARGE_BUCKET for the large bucket feature, INCOMPAT means it introduces incompatible on-disk format change. - Add BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET) routines. - Adds __le16 bucket_size_hi into struct cache_sb_disk at offset 0x8d0 for the on-disk super block format. - For the in-memory super block struct cache_sb, member bucket_size is extended from __u16 to __32. - Add get_bucket_size() to combine the bucket_size and bucket_size_hi from struct cache_sb_disk into an unsigned int value. Since we already have large bucket size helpers meta_bucket_pages(), meta_bucket_bytes() and alloc_meta_bucket_pages(), they make sure when bucket size > 8MB, the memory allocation for bcache meta data bucket won't fail no matter how large the bucket size extended. So these meta data buckets are handled properly when the bucket size width increase from 16bit to 32bit, we don't need to worry about them. Signed-off-by: Coly Li <colyli@suse.de> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-07-25 12:00:35 +00:00
__u32 bucket_size; /* sectors */
};
struct {
/* Backing devices */
__u64 data_offset;
/*
* block_size from the cache device section is still used by
* backing devices, so don't add anything here until we fix
* things to not need it for backing devices anymore
*/
};
};
__u32 last_mount; /* time overflow in y2106 */
__u16 first_bucket;
union {
__u16 njournal_buckets;
__u16 keys;
};
__u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */
};
static inline _Bool SB_IS_BDEV(const struct cache_sb *sb)
{
return sb->version == BCACHE_SB_VERSION_BDEV
|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET
|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_FEATURES;
}
BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1);
BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3);
#define CACHE_REPLACEMENT_LRU 0U
#define CACHE_REPLACEMENT_FIFO 1U
#define CACHE_REPLACEMENT_RANDOM 2U
BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
#define CACHE_MODE_WRITETHROUGH 0U
#define CACHE_MODE_WRITEBACK 1U
#define CACHE_MODE_WRITEAROUND 2U
#define CACHE_MODE_NONE 3U
BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
#define BDEV_STATE_NONE 0U
#define BDEV_STATE_CLEAN 1U
#define BDEV_STATE_DIRTY 2U
#define BDEV_STATE_STALE 3U
/*
* Magic numbers
*
* The various other data structures have their own magic numbers, which are
* xored with the first part of the cache set's UUID
*/
#define JSET_MAGIC 0x245235c1a3625032ULL
#define PSET_MAGIC 0x6750e15f87337f91ULL
#define BSET_MAGIC 0x90135c78b99e07f5ULL
static inline __u64 jset_magic(struct cache_sb *sb)
{
return sb->set_magic ^ JSET_MAGIC;
}
static inline __u64 pset_magic(struct cache_sb *sb)
{
return sb->set_magic ^ PSET_MAGIC;
}
static inline __u64 bset_magic(struct cache_sb *sb)
{
return sb->set_magic ^ BSET_MAGIC;
}
/*
* Journal
*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
* sequence number.
*
* last_seq is the oldest journal entry that still has keys the btree hasn't
* flushed to disk yet.
*
* version is for on disk format changes.
*/
#define BCACHE_JSET_VERSION_UUIDv1 1
#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */
#define BCACHE_JSET_VERSION 1
struct jset {
__u64 csum;
__u64 magic;
__u64 seq;
__u32 version;
__u32 keys;
__u64 last_seq;
BKEY_PADDED(uuid_bucket);
BKEY_PADDED(btree_root);
__u16 btree_level;
__u16 pad[3];
__u64 prio_bucket[MAX_CACHES_PER_SET];
union {
DECLARE_FLEX_ARRAY(struct bkey, start);
DECLARE_FLEX_ARRAY(__u64, d);
};
};
/* Bucket prios/gens */
struct prio_set {
__u64 csum;
__u64 magic;
__u64 seq;
__u32 version;
__u32 pad;
__u64 next_bucket;
struct bucket_disk {
__u16 prio;
__u8 gen;
} __attribute((packed)) data[];
};
/* UUIDS - per backing device/flash only volume metadata */
struct uuid_entry {
union {
struct {
__u8 uuid[16];
__u8 label[32];
__u32 first_reg; /* time overflow in y2106 */
__u32 last_reg;
__u32 invalidated;
__u32 flags;
/* Size of flash only volumes */
__u64 sectors;
};
__u8 pad[128];
};
};
BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
/* Btree nodes */
/* Version 1: Seed pointer into btree node checksum
*/
#define BCACHE_BSET_CSUM 1
#define BCACHE_BSET_VERSION 1
/*
* Btree nodes
*
* On disk a btree node is a list/log of these; within each set the keys are
* sorted
*/
struct bset {
__u64 csum;
__u64 magic;
__u64 seq;
__u32 version;
__u32 keys;
union {
DECLARE_FLEX_ARRAY(struct bkey, start);
DECLARE_FLEX_ARRAY(__u64, d);
};
};
/* OBSOLETE */
/* UUIDS - per backing device/flash only volume metadata */
struct uuid_entry_v0 {
__u8 uuid[16];
__u8 label[32];
__u32 first_reg;
__u32 last_reg;
__u32 invalidated;
__u32 pad;
};
#endif /* _LINUX_BCACHE_H */