linux-stable/fs/xfs/libxfs/xfs_types.h

243 lines
7.3 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
*/
#ifndef __XFS_TYPES_H__
#define __XFS_TYPES_H__
typedef uint32_t prid_t; /* project ID */
typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
typedef uint64_t xfs_extnum_t; /* # of extents in a file */
typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
xfs: xfs_log_force_lsn isn't passed a LSN In doing an investigation into AIL push stalls, I was looking at the log force code to see if an async CIL push could be done instead. This lead me to xfs_log_force_lsn() and looking at how it works. xfs_log_force_lsn() is only called from inode synchronisation contexts such as fsync(), and it takes the ip->i_itemp->ili_last_lsn value as the LSN to sync the log to. This gets passed to xlog_cil_force_lsn() via xfs_log_force_lsn() to flush the CIL to the journal, and then used by xfs_log_force_lsn() to flush the iclogs to the journal. The problem is that ip->i_itemp->ili_last_lsn does not store a log sequence number. What it stores is passed to it from the ->iop_committing method, which is called by xfs_log_commit_cil(). The value this passes to the iop_committing method is the CIL context sequence number that the item was committed to. As it turns out, xlog_cil_force_lsn() converts the sequence to an actual commit LSN for the related context and returns that to xfs_log_force_lsn(). xfs_log_force_lsn() overwrites it's "lsn" variable that contained a sequence with an actual LSN and then uses that to sync the iclogs. This caused me some confusion for a while, even though I originally wrote all this code a decade ago. ->iop_committing is only used by a couple of log item types, and only inode items use the sequence number it is passed. Let's clean up the API, CIL structures and inode log item to call it a sequence number, and make it clear that the high level code is using CIL sequence numbers and not on-disk LSNs for integrity synchronisation purposes. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-06-18 15:21:52 +00:00
typedef int64_t xfs_csn_t; /* CIL sequence number */
typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
typedef uint32_t xfs_dahash_t; /* dir/attr hash value */
typedef uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
typedef uint64_t xfs_fileoff_t; /* block number in a file */
typedef uint64_t xfs_filblks_t; /* number of blocks in a file */
typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
/*
* New verifiers will return the instruction address of the failing check.
* NULL means everything is ok.
*/
typedef void * xfs_failaddr_t;
/*
* Null values for the types.
*/
#define NULLFSBLOCK ((xfs_fsblock_t)-1)
#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
#define NULLRTBLOCK ((xfs_rtblock_t)-1)
#define NULLFILEOFF ((xfs_fileoff_t)-1)
#define NULLAGBLOCK ((xfs_agblock_t)-1)
#define NULLAGNUMBER ((xfs_agnumber_t)-1)
#define NULLCOMMITLSN ((xfs_lsn_t)-1)
#define NULLFSINO ((xfs_ino_t)-1)
#define NULLAGINO ((xfs_agino_t)-1)
/*
* Minimum and maximum blocksize and sectorsize.
* The blocksize upper limit is pretty much arbitrary.
* The sectorsize upper limit is due to sizeof(sb_sectsize).
* CRC enable filesystems use 512 byte inodes, meaning 512 byte block sizes
* cannot be used.
*/
#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */
#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */
#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG)
#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG)
#define XFS_MIN_CRC_BLOCKSIZE (1 << (XFS_MIN_BLOCKSIZE_LOG + 1))
#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */
#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */
#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG)
#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG)
/*
* Inode fork identifiers.
*/
#define XFS_DATA_FORK 0
#define XFS_ATTR_FORK 1
#define XFS_COW_FORK 2
#define XFS_WHICHFORK_STRINGS \
{ XFS_DATA_FORK, "data" }, \
{ XFS_ATTR_FORK, "attr" }, \
{ XFS_COW_FORK, "cow" }
/*
* Min numbers of data/attr fork btree root pointers.
*/
#define MINDBTPTRS 3
#define MINABTPTRS 2
/*
* MAXNAMELEN is the length (including the terminating null) of
* the longest permissible file (component) name.
*/
#define MAXNAMELEN 256
/*
* This enum is used in string mapping in xfs_trace.h; please keep the
* TRACE_DEFINE_ENUMs for it up to date.
*/
typedef enum {
XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
} xfs_lookup_t;
#define XFS_AG_BTREE_CMP_FORMAT_STR \
{ XFS_LOOKUP_EQi, "eq" }, \
{ XFS_LOOKUP_LEi, "le" }, \
{ XFS_LOOKUP_GEi, "ge" }
/*
* This enum is used in string mapping in xfs_trace.h and scrub/trace.h;
* please keep the TRACE_DEFINE_ENUMs for it up to date.
*/
typedef enum {
XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
} xfs_btnum_t;
#define XFS_BTNUM_STRINGS \
{ XFS_BTNUM_BNOi, "bnobt" }, \
{ XFS_BTNUM_CNTi, "cntbt" }, \
{ XFS_BTNUM_RMAPi, "rmapbt" }, \
{ XFS_BTNUM_BMAPi, "bmbt" }, \
{ XFS_BTNUM_INOi, "inobt" }, \
{ XFS_BTNUM_FINOi, "finobt" }, \
{ XFS_BTNUM_REFCi, "refcbt" }
struct xfs_name {
const unsigned char *name;
int len;
xfs: Add read-only support for dirent filetype field Add support for the file type field in directory entries so that readdir can return the type of the inode the dirent points to to userspace without first having to read the inode off disk. The encoding of the type field is a single byte that is added to the end of the directory entry name length. For all intents and purposes, it appends a "hidden" byte to the name field which contains the type information. As the directory entry is already of dynamic size, helpers are already required to access and decode the direct entry structures. Hence the relevent extraction and iteration helpers are updated to understand the hidden byte. Helpers for reading and writing the filetype field from the directory entries are also added. Only the read helpers are used by this patch. It also adds all the code necessary to read the type information out of the dirents on disk. Further we add the superblock feature bit and helpers to indicate that we understand the on-disk format change. This is not a compatible change - existing kernels cannot read the new format successfully - so an incompatible feature flag is added. We don't yet allow filesystems to mount with this flag yet - that will be added once write support is added. Finally, the code to take the type from the VFS, convert it to an XFS on-disk type and put it into the xfs_name structures passed around is added, but the directory code does not use this field yet. That will be in the next patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-12 10:50:09 +00:00
int type;
};
/*
* uid_t and gid_t are hard-coded to 32 bits in the inode.
* Hence, an 'id' in a dquot is 32 bits..
*/
typedef uint32_t xfs_dqid_t;
/*
* Constants for bit manipulations.
*/
#define XFS_NBBYLOG 3 /* log2(NBBY) */
#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
#define XFS_NBWORD (1 << XFS_NBWORDLOG)
#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
struct xfs_iext_cursor {
xfs: use a b+tree for the in-core extent list Replace the current linear list and the indirection array for the in-core extent list with a b+tree to avoid the need for larger memory allocations for the indirection array when lots of extents are present. The current extent list implementations leads to heavy pressure on the memory allocator when modifying files with a high extent count, and can lead to high latencies because of that. The replacement is a b+tree with a few quirks. The leaf nodes directly store the extent record in two u64 values. The encoding is a little bit different from the existing in-core extent records so that the start offset and length which are required for lookups can be retreived with simple mask operations. The inner nodes store a 64-bit key containing the start offset in the first half of the node, and the pointers to the next lower level in the second half. In either case we walk the node from the beginninig to the end and do a linear search, as that is more efficient for the low number of cache lines touched during a search (2 for the inner nodes, 4 for the leaf nodes) than a binary search. We store termination markers (zero length for the leaf nodes, an otherwise impossible high bit for the inner nodes) to terminate the key list / records instead of storing a count to use the available cache lines as efficiently as possible. One quirk of the algorithm is that while we normally split a node half and half like usual btree implementations we just spill over entries added at the very end of the list to a new node on its own. This means we get a 100% fill grade for the common cases of bulk insertion when reading an inode into memory, and when only sequentially appending to a file. The downside is a slightly higher chance of splits on the first random insertions. Both insert and removal manually recurse into the lower levels, but the bulk deletion of the whole tree is still implemented as a recursive function call, although one limited by the overall depth and with very little stack usage in every iteration. For the first few extents we dynamically grow the list from a single extent to the next powers of two until we have a first full leaf block and that building the actual tree. The code started out based on the generic lib/btree.c code from Joern Engel based on earlier work from Peter Zijlstra, but has since been rewritten beyond recognition. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2017-11-03 17:34:46 +00:00
struct xfs_iext_leaf *leaf;
int pos;
};
typedef enum {
XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
} xfs_exntst_t;
typedef struct xfs_bmbt_irec
{
xfs_fileoff_t br_startoff; /* starting file offset */
xfs_fsblock_t br_startblock; /* starting block number */
xfs_filblks_t br_blockcount; /* number of blocks */
xfs_exntst_t br_state; /* extent state */
} xfs_bmbt_irec_t;
xfs: track cow/shared record domains explicitly in xfs_refcount_irec Just prior to committing the reflink code into upstream, the xfs maintainer at the time requested that I find a way to shard the refcount records into two domains -- one for records tracking shared extents, and a second for tracking CoW staging extents. The idea here was to minimize mount time CoW reclamation by pushing all the CoW records to the right edge of the keyspace, and it was accomplished by setting the upper bit in rc_startblock. We don't allow AGs to have more than 2^31 blocks, so the bit was free. Unfortunately, this was a very late addition to the codebase, so most of the refcount record processing code still treats rc_startblock as a u32 and pays no attention to whether or not the upper bit (the cow flag) is set. This is a weakness is theoretically exploitable, since we're not fully validating the incoming metadata records. Fuzzing demonstrates practical exploits of this weakness. If the cow flag of a node block key record is corrupted, a lookup operation can go to the wrong record block and start returning records from the wrong cow/shared domain. This causes the math to go all wrong (since cow domain is still implicit in the upper bit of rc_startblock) and we can crash the kernel by tricking xfs into jumping into a nonexistent AG and tripping over xfs_perag_get(mp, <nonexistent AG>) returning NULL. To fix this, start tracking the domain as an explicit part of struct xfs_refcount_irec, adjust all refcount functions to check the domain of a returned record, and alter the function definitions to accept them where necessary. Found by fuzzing keys[2].cowflag = add in xfs/464. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Dave Chinner <dchinner@redhat.com>
2022-10-10 16:06:24 +00:00
enum xfs_refc_domain {
XFS_REFC_DOMAIN_SHARED = 0,
XFS_REFC_DOMAIN_COW,
};
#define XFS_REFC_DOMAIN_STRINGS \
{ XFS_REFC_DOMAIN_SHARED, "shared" }, \
{ XFS_REFC_DOMAIN_COW, "cow" }
struct xfs_refcount_irec {
xfs_agblock_t rc_startblock; /* starting block number */
xfs_extlen_t rc_blockcount; /* count of free blocks */
xfs_nlink_t rc_refcount; /* number of inodes linked here */
xfs: track cow/shared record domains explicitly in xfs_refcount_irec Just prior to committing the reflink code into upstream, the xfs maintainer at the time requested that I find a way to shard the refcount records into two domains -- one for records tracking shared extents, and a second for tracking CoW staging extents. The idea here was to minimize mount time CoW reclamation by pushing all the CoW records to the right edge of the keyspace, and it was accomplished by setting the upper bit in rc_startblock. We don't allow AGs to have more than 2^31 blocks, so the bit was free. Unfortunately, this was a very late addition to the codebase, so most of the refcount record processing code still treats rc_startblock as a u32 and pays no attention to whether or not the upper bit (the cow flag) is set. This is a weakness is theoretically exploitable, since we're not fully validating the incoming metadata records. Fuzzing demonstrates practical exploits of this weakness. If the cow flag of a node block key record is corrupted, a lookup operation can go to the wrong record block and start returning records from the wrong cow/shared domain. This causes the math to go all wrong (since cow domain is still implicit in the upper bit of rc_startblock) and we can crash the kernel by tricking xfs into jumping into a nonexistent AG and tripping over xfs_perag_get(mp, <nonexistent AG>) returning NULL. To fix this, start tracking the domain as an explicit part of struct xfs_refcount_irec, adjust all refcount functions to check the domain of a returned record, and alter the function definitions to accept them where necessary. Found by fuzzing keys[2].cowflag = add in xfs/464. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Dave Chinner <dchinner@redhat.com>
2022-10-10 16:06:24 +00:00
enum xfs_refc_domain rc_domain; /* shared or cow staging extent? */
};
#define XFS_RMAP_ATTR_FORK (1 << 0)
#define XFS_RMAP_BMBT_BLOCK (1 << 1)
#define XFS_RMAP_UNWRITTEN (1 << 2)
#define XFS_RMAP_KEY_FLAGS (XFS_RMAP_ATTR_FORK | \
XFS_RMAP_BMBT_BLOCK)
#define XFS_RMAP_REC_FLAGS (XFS_RMAP_UNWRITTEN)
struct xfs_rmap_irec {
xfs_agblock_t rm_startblock; /* extent start block */
xfs_extlen_t rm_blockcount; /* extent length */
uint64_t rm_owner; /* extent owner */
uint64_t rm_offset; /* offset within the owner */
unsigned int rm_flags; /* state flags */
};
/* per-AG block reservation types */
enum xfs_ag_resv_type {
XFS_AG_RESV_NONE = 0,
XFS_AG_RESV_AGFL,
XFS_AG_RESV_METADATA,
XFS_AG_RESV_RMAPBT,
};
/* Results of scanning a btree keyspace to check occupancy. */
enum xbtree_recpacking {
/* None of the keyspace maps to records. */
XBTREE_RECPACKING_EMPTY = 0,
/* Some, but not all, of the keyspace maps to records. */
XBTREE_RECPACKING_SPARSE,
/* The entire keyspace maps to records. */
XBTREE_RECPACKING_FULL,
};
/*
* Type verifier functions
*/
struct xfs_mount;
bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno,
xfs_fsblock_t len);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
xfs_rtblock_t len);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min,
unsigned long long *max);
bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
xfs_fileoff_t len);
#endif /* __XFS_TYPES_H__ */