linux-stable/fs/btrfs/extent-io-tree.c
Josef Bacik 5a75034e71 btrfs: do not panic if we can't allocate a prealloc extent state
We sometimes have to allocate new extent states when clearing or setting
new bits in an extent io tree.  Generally we preallocate this before
taking the tree spin lock, but we can use this preallocated extent state
sometimes and then need to try to do a GFP_ATOMIC allocation under the
lock.

Unfortunately sometimes this fails, and then we hit the BUG_ON() and
bring the box down.  This happens roughly 20 times a week in our fleet.

However the vast majority of callers use GFP_NOFS, which means that if
this GFP_ATOMIC allocation fails, we could simply drop the spin lock, go
back and allocate a new extent state with our given gfp mask, and begin
again from where we left off.

For the remaining callers that do not use GFP_NOFS, they are generally
using GFP_NOWAIT, which still allows for some reclaim.  So allow these
allocations to attempt to happen outside of the spin lock so we don't
need to rely on GFP_ATOMIC allocations.

This in essence creates an infinite loop for anything that isn't
GFP_NOFS.  To address this we may want to migrate to using mempools for
extent states so that we will always have emergency reserves in order to
make our allocations.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-12-05 18:00:41 +01:00

1707 lines
44 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <trace/events/btrfs.h>
#include "ctree.h"
#include "extent-io-tree.h"
#include "btrfs_inode.h"
#include "misc.h"
/*
 * Slab cache that every struct extent_state in this file is allocated from
 * (alloc_extent_state()) and returned to (free_extent_state() and the debug
 * leak check).
 */
static struct kmem_cache *extent_state_cache;
/*
 * Tell whether @state is currently linked into an rbtree, i.e. its rb_node is
 * not in the "cleared" (RB_CLEAR_NODE) condition.
 */
static inline bool extent_state_in_tree(const struct extent_state *state)
{
	const bool unlinked = RB_EMPTY_NODE(&state->rb_node);

	return !unlinked;
}
#ifdef CONFIG_BTRFS_DEBUG
/* Debug-only list of allocated extent states, linked through ->leak_list. */
static LIST_HEAD(states);
/* Serializes additions to and removals from 'states' (taken irqsave). */
static DEFINE_SPINLOCK(leak_lock);
/*
 * Put @state on the debug leak-tracking list. The list lock is taken with
 * interrupts saved/disabled for the duration of the insertion.
 */
static inline void btrfs_leak_debug_add_state(struct extent_state *state)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&leak_lock, irq_flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, irq_flags);
}
/*
 * Take @state off the debug leak-tracking list. The list lock is taken with
 * interrupts saved/disabled for the duration of the removal.
 */
static inline void btrfs_leak_debug_del_state(struct extent_state *state)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&leak_lock, irq_flags);
	list_del(&state->leak_list);
	spin_unlock_irqrestore(&leak_lock, irq_flags);
}
static inline void btrfs_extent_state_leak_debug_check(void)
{
struct extent_state *state;
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
state->start, state->end, state->state,
extent_state_in_tree(state),
refcount_read(&state->refs));
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
}
}
/*
 * Debug-build wrapper: forwards to __btrfs_debug_check_extent_io_range() and
 * supplies the name of the calling function for the log message.
 */
#define btrfs_debug_check_extent_io_range(tree, start, end) \
__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
/*
 * Debug aid that flags suspicious range ends passed to the io tree helpers.
 *
 * @caller: name of the function that was handed the range
 * @tree:   io tree being operated on; only checked when ->private_data is set
 *          (it is treated as the owning inode)
 * @start:  first byte of the range (only used in the message)
 * @end:    last byte of the range
 *
 * A message is emitted through btrfs_debug_rl() when @end is at least
 * PAGE_SIZE, is an even number, and is not the last byte of the file
 * (i_size - 1).
 */
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
						       struct extent_io_tree *tree,
						       u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	u64 isize;

	if (!inode)
		return;

	isize = i_size_read(inode);
	/* Nothing odd about small ends, odd-valued ends or the end of file. */
	if (end < PAGE_SIZE || (end % 2) != 0 || end == isize - 1)
		return;

	btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
		"%s: ino %llu isize %llu odd range [%llu,%llu]",
		caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
}
#else
/*
 * CONFIG_BTRFS_DEBUG is not set: the leak tracking and range sanity check
 * helpers compile to nothing.
 */
#define btrfs_leak_debug_add_state(state) do {} while (0)
#define btrfs_leak_debug_del_state(state) do {} while (0)
#define btrfs_extent_state_leak_debug_check() do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
/*
* For the file_extent_tree, we want to hold the inode lock when we look up and
* update the disk_i_size, but lockdep will complain because with our io_tree
* we hold the tree lock and then take the inode lock when setting delalloc.
* These two things are unrelated, so use a separate lock class for the
* file_extent_tree so that the two locking patterns don't get mixed up.
*
* The class is applied in extent_io_tree_init() when the tree's owner is
* IO_TREE_INODE_FILE_EXTENT.
*/
static struct lock_class_key file_extent_tree_class;
/*
 * A start/end offset pair with an embedded rbtree node.
 * NOTE(review): no user of this struct is visible in this part of the file;
 * confirm whether it is still needed.
 */
struct tree_entry {
u64 start;
u64 end;
struct rb_node rb_node;
};
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data)
{
tree->fs_info = fs_info;
tree->state = RB_ROOT;
spin_lock_init(&tree->lock);
tree->private_data = private_data;
tree->owner = owner;
if (owner == IO_TREE_INODE_FILE_EXTENT)
lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
void extent_io_tree_release(struct extent_io_tree *tree)
{
spin_lock(&tree->lock);
/*
* Do a single barrier for the waitqueue_active check here, the state
* of the waitqueue should not change once extent_io_tree_release is
* called.
*/
smp_mb();
while (!RB_EMPTY_ROOT(&tree->state)) {
struct rb_node *node;
struct extent_state *state;
node = rb_first(&tree->state);
state = rb_entry(node, struct extent_state, rb_node);
rb_erase(&state->rb_node, &tree->state);
RB_CLEAR_NODE(&state->rb_node);
/*
* btree io trees aren't supposed to have tasks waiting for
* changes in the flags of extent states ever.
*/
ASSERT(!waitqueue_active(&state->wq));
free_extent_state(state);
cond_resched_lock(&tree->lock);
}
spin_unlock(&tree->lock);
}
/*
 * Allocate and initialize an extent state that is not linked into any tree.
 *
 * @mask: allocation flags; __GFP_DMA32 and __GFP_HIGHMEM are stripped before
 *        the slab allocator is called
 *
 * Returns the new state with no bits set and a reference count of 1, or NULL
 * if the allocation failed.
 */
static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *fresh;

	/*
	 * The given mask might be not appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);

	fresh = kmem_cache_alloc(extent_state_cache, mask);
	if (fresh) {
		fresh->state = 0;
		RB_CLEAR_NODE(&fresh->rb_node);
		btrfs_leak_debug_add_state(fresh);
		refcount_set(&fresh->refs, 1);
		init_waitqueue_head(&fresh->wq);
		trace_alloc_extent_state(fresh, mask, _RET_IP_);
	}
	return fresh;
}
/*
 * Make sure a spare extent state is available: hand back @prealloc when the
 * caller already has one, otherwise try a GFP_ATOMIC allocation. May return
 * NULL if that allocation fails.
 */
static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
{
	return prealloc ? prealloc : alloc_extent_state(GFP_ATOMIC);
}
/*
 * Drop one reference on @state (NULL is accepted and ignored). When the last
 * reference goes away the state is returned to the slab cache; a state that
 * is still linked into a tree at that point triggers a warning.
 */
void free_extent_state(struct extent_state *state)
{
	if (!state || !refcount_dec_and_test(&state->refs))
		return;

	WARN_ON(extent_state_in_tree(state));
	btrfs_leak_debug_del_state(state);
	trace_free_extent_state(state, _RET_IP_);
	kmem_cache_free(extent_state_cache, state);
}
/*
 * Record in @changeset that @state's range is about to have @bits set or
 * cleared.
 *
 * @state:     extent state about to be modified
 * @bits:      bits that will be set or cleared
 * @changeset: accumulator, may be NULL in which case nothing is recorded
 * @set:       non-zero when setting bits, zero when clearing
 *
 * Nothing is recorded when the operation would not change the state (all of
 * @bits already set when setting, none of them set when clearing).
 *
 * Returns 0 when nothing was recorded, otherwise the result of ulist_add().
 */
static int add_extent_changeset(struct extent_state *state, u32 bits,
				struct extent_changeset *changeset,
				int set)
{
	u32 present;

	if (!changeset)
		return 0;

	present = state->state & bits;
	if (set ? (present == bits) : (present == 0))
		return 0;

	changeset->bytes_changed += state->end - state->start + 1;
	return ulist_add(&changeset->range_changed, state->start, state->end,
			 GFP_ATOMIC);
}
/* Return the extent state that follows @state in rbtree order, or NULL. */
static inline struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *successor = rb_next(&state->rb_node);

	return successor ? rb_entry(successor, struct extent_state, rb_node) : NULL;
}
/* Return the extent state that precedes @state in rbtree order, or NULL. */
static inline struct extent_state *prev_state(struct extent_state *state)
{
	struct rb_node *predecessor = rb_prev(&state->rb_node);

	return predecessor ? rb_entry(predecessor, struct extent_state, rb_node) : NULL;
}
/*
 * Search @tree for an entry that contains @offset, i.e. one with
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @node_ret:   optional; receives the link where a new node for @offset
 *              should be anchored (used when inserting an entry in the tree)
 * @parent_ret: optional; receives the node that would be the parent of such a
 *              new node
 *
 * If an entry containing @offset exists it is returned and @node_ret and
 * @parent_ret are left untouched.
 *
 * Otherwise @node_ret and @parent_ret are filled (when non-NULL) and the
 * first entry that ends at or after @offset is returned. This is NULL when
 * the tree is empty or every entry ends before @offset.
 */
static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
							  u64 offset,
							  struct rb_node ***node_ret,
							  struct rb_node **parent_ret)
{
	struct rb_node **link = &tree->state.rb_node;
	struct rb_node *last_visited = NULL;
	struct extent_state *cur = NULL;

	while (*link) {
		last_visited = *link;
		cur = rb_entry(last_visited, struct extent_state, rb_node);

		if (offset >= cur->start && offset <= cur->end)
			return cur;

		if (offset < cur->start)
			link = &last_visited->rb_left;
		else
			link = &last_visited->rb_right;
	}

	if (node_ret)
		*node_ret = link;
	if (parent_ret)
		*parent_ret = last_visited;

	/* Walk forward from the last visited entry to the first one past @offset. */
	while (cur && offset > cur->end)
		cur = next_state(cur);

	return cur;
}
/*
 * Search @offset in the tree, or report its neighbours.
 *
 * @tree:     the tree to search
 * @offset:   offset that should fall within an entry in @tree
 * @prev_ret: must not be NULL; on a miss receives the closest entry that
 *            starts at or before @offset, or NULL if there is none
 * @next_ret: must not be NULL; on a miss receives the closest entry that ends
 *            at or after @offset, or NULL if there is none
 *
 * Return the entry that contains @offset, leaving @prev_ret and @next_ret
 * untouched. If no such entry exists, return NULL and fill @prev_ret and
 * @next_ret.
 */
static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
						  u64 offset,
						  struct extent_state **prev_ret,
						  struct extent_state **next_ret)
{
	struct rb_node **link = &tree->state.rb_node;
	struct extent_state *last_visited = NULL;
	struct extent_state *walk;

	ASSERT(prev_ret);
	ASSERT(next_ret);

	while (*link) {
		last_visited = rb_entry(*link, struct extent_state, rb_node);

		if (offset >= last_visited->start && offset <= last_visited->end)
			return last_visited;

		if (offset < last_visited->start)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	/* Both neighbour walks start from the last entry the descent touched. */
	walk = last_visited;
	while (walk && offset > walk->end)
		walk = next_state(walk);
	*next_ret = walk;

	walk = last_visited;
	while (walk && offset < walk->start)
		walk = prev_state(walk);
	*prev_ret = walk;

	return NULL;
}
/*
 * Inexact rb-tree search: return the entry containing @offset, or the next
 * entry after it if @offset is not covered (NULL if there is none).
 */
static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
{
	struct extent_state *hit;

	hit = tree_search_for_insert(tree, offset, NULL, NULL);
	return hit;
}
/*
 * Report, via btrfs_panic(), that the tree was found in a state that should
 * be impossible while its lock is held. @err is the error code that exposed
 * the inconsistency.
 */
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	struct btrfs_fs_info *fs_info = tree->fs_info;

	btrfs_panic(fs_info, err,
		    "locking error: extent tree was modified by another thread while locked");
}
/*
 * Try to merge @state with its immediate neighbours in the tree.
 *
 * A neighbour is absorbed when it is byte-adjacent to @state and carries
 * exactly the same state bits. The neighbour is then removed from the tree
 * and its reference dropped, and @state grows to cover its range. States with
 * EXTENT_LOCKED or EXTENT_BOUNDARY set are never merged.
 *
 * When the tree has private data, btrfs_merge_delalloc_extent() is told about
 * each merge before the ranges are combined.
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
{
	struct extent_state *neighbor;

	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	/* Left neighbour first: extend @state downwards. */
	neighbor = prev_state(state);
	if (neighbor && neighbor->state == state->state &&
	    neighbor->end == state->start - 1) {
		if (tree->private_data)
			btrfs_merge_delalloc_extent(tree->private_data,
						    state, neighbor);
		state->start = neighbor->start;
		rb_erase(&neighbor->rb_node, &tree->state);
		RB_CLEAR_NODE(&neighbor->rb_node);
		free_extent_state(neighbor);
	}

	/* Then the right neighbour: extend @state upwards. */
	neighbor = next_state(state);
	if (neighbor && neighbor->state == state->state &&
	    neighbor->start == state->end + 1) {
		if (tree->private_data)
			btrfs_merge_delalloc_extent(tree->private_data,
						    state, neighbor);
		state->end = neighbor->end;
		rb_erase(&neighbor->rb_node, &tree->state);
		RB_CLEAR_NODE(&neighbor->rb_node);
		free_extent_state(neighbor);
	}
}
/*
 * Set @bits (minus the EXTENT_CTLBITS control bits) on @state.
 *
 * When the tree has private data, btrfs_set_delalloc_extent() is called with
 * the unfiltered @bits first. The change is recorded in @changeset (if any)
 * before the bits are applied; a failure to record it is fatal (BUG_ON).
 */
static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   u32 bits, struct extent_changeset *changeset)
{
	const u32 wanted = bits & ~EXTENT_CTLBITS;
	int err;

	if (tree->private_data)
		btrfs_set_delalloc_extent(tree->private_data, state, bits);

	err = add_extent_changeset(state, wanted, changeset, 1);
	BUG_ON(err < 0);

	state->state |= wanted;
}
/*
 * Insert an extent_state struct into the tree. 'bits' are set on the
 * struct before it is inserted.
 *
 * @tree:      tree to insert into
 * @state:     state to insert, with ->start and ->end already filled in
 * @bits:      bits to set on @state before linking it
 * @changeset: optional changeset to record the bit change in
 *
 * Returns 0 on success, after which the state may have been merged with its
 * neighbours. Returns -EEXIST if state->end falls inside an entry that is
 * already in the tree. In that case @state is not linked into the tree and is
 * NOT freed here, it stays owned by the caller (note that @bits have already
 * been set on it by then).
 *
 * The tree lock is not taken internally. This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state,
			u32 bits, struct extent_changeset *changeset)
{
	struct rb_node **node;
	/*
	 * Must start out as NULL: if the tree is empty the loop below never
	 * runs and rb_link_node() would otherwise be handed an uninitialized
	 * parent pointer.
	 */
	struct rb_node *parent = NULL;
	const u64 end = state->end;

	set_state_bits(tree, state, bits, changeset);

	node = &tree->state.rb_node;
	while (*node) {
		struct extent_state *entry;

		parent = *node;
		entry = rb_entry(parent, struct extent_state, rb_node);

		if (end < entry->start) {
			node = &(*node)->rb_left;
		} else if (end > entry->end) {
			node = &(*node)->rb_right;
		} else {
			btrfs_err(tree->fs_info,
			       "found node %llu %llu on insert of %llu %llu",
			       entry->start, entry->end, state->start, end);
			return -EEXIST;
		}
	}

	rb_link_node(&state->rb_node, parent, node);
	rb_insert_color(&state->rb_node, &tree->state);

	merge_state(tree, state);
	return 0;
}
/*
 * Insert @state into @tree at the position given by @node and @parent,
 * skipping the tree descent.
 *
 * @bits are set on @state (and recorded in @changeset, if any) before it is
 * linked; afterwards the state is merged with its neighbours when possible.
 * Callers in this file obtain @node and @parent from tree_search_for_insert().
 */
static void insert_state_fast(struct extent_io_tree *tree,
			      struct extent_state *state, struct rb_node **node,
			      struct rb_node *parent, unsigned bits,
			      struct extent_changeset *changeset)
{
	struct rb_node *new_node = &state->rb_node;

	set_state_bits(tree, state, bits, changeset);

	rb_link_node(new_node, parent, node);
	rb_insert_color(new_node, &tree->state);

	merge_state(tree, state);
}
/*
* Split a given extent state struct in two, inserting the preallocated
* struct 'prealloc' as the newly created first (lower) half. 'split' indicates
* an offset inside 'orig' where it should be split.
*
* Before calling,
* the tree has 'orig' at [orig->start, orig->end]. After calling, there
* are two extent state structs in the tree:
* prealloc: [orig->start, split - 1]
* orig: [ split, orig->end ]
*
* 'prealloc' inherits the state bits of 'orig'. When the tree has private
* data, btrfs_split_delalloc_extent() is called before anything is modified.
*
* Returns 0 on success. Returns -EEXIST if the end of the new first half falls
* inside an entry found during the descent; in that case a reference on
* 'prealloc' is dropped here, while 'orig' has already had its start moved to
* 'split'.
*
* The tree locks are not taken by this function. They need to be held
* by the caller.
*/
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct extent_state *prealloc, u64 split)
{
struct rb_node *parent = NULL;
struct rb_node **node;
if (tree->private_data)
btrfs_split_delalloc_extent(tree->private_data, orig, split);
prealloc->start = orig->start;
prealloc->end = split - 1;
prealloc->state = orig->state;
orig->start = split;
/*
* Start the descent at 'orig' itself rather than at the tree root: 'node'
* points at the local 'parent' variable, which holds orig's rb_node, so the
* first loop iteration examines 'orig' and then moves into its subtree.
*/
parent = &orig->rb_node;
node = &parent;
while (*node) {
struct extent_state *entry;
parent = *node;
entry = rb_entry(parent, struct extent_state, rb_node);
if (prealloc->end < entry->start) {
node = &(*node)->rb_left;
} else if (prealloc->end > entry->end) {
node = &(*node)->rb_right;
} else {
free_extent_state(prealloc);
return -EEXIST;
}
}
rb_link_node(&prealloc->rb_node, parent, node);
rb_insert_color(&prealloc->rb_node, &tree->state);
return 0;
}
/*
 * Utility function to clear some bits in an extent state struct.
 *
 * @tree:      tree that @state lives in
 * @state:     state to clear the bits on
 * @bits:      bits to clear; EXTENT_CTLBITS are filtered out before clearing
 * @wake:      when non-zero, wake up anyone waiting on this state
 * @changeset: optional changeset to record the change in
 *
 * If no bits are left set on the state after clearing, it is removed from the
 * tree and its reference dropped. Otherwise an attempt is made to merge it
 * with its neighbours.
 *
 * Returns the state that follows @state in the tree (NULL if there is none),
 * so that callers can keep iterating.
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    u32 bits, int wake,
					    struct extent_changeset *changeset)
{
	const u32 dropped = bits & ~EXTENT_CTLBITS;
	struct extent_state *following;
	int err;

	if (tree->private_data)
		btrfs_clear_delalloc_extent(tree->private_data, state, bits);

	err = add_extent_changeset(state, dropped, changeset, 0);
	BUG_ON(err < 0);
	state->state &= ~dropped;

	if (wake)
		wake_up(&state->wq);

	/* Bits remain: the state stays, but may now match a neighbour. */
	if (state->state != 0) {
		merge_state(tree, state);
		return next_state(state);
	}

	/* Look up the successor before the state is unlinked. */
	following = next_state(state);
	if (!extent_state_in_tree(state)) {
		WARN_ON(1);
		return following;
	}

	rb_erase(&state->rb_node, &tree->state);
	RB_CLEAR_NODE(&state->rb_node);
	free_extent_state(state);
	return following;
}
/*
* Clear some bits on a range in the tree. This may require splitting or
* inserting elements in the tree, so the gfp mask is used to indicate which
* allocations or sleeping are allowed.
*
* @tree:         the io tree to operate on
* @start:        first byte of the range
* @end:          last byte of the range; the range [start, end] is inclusive
* @bits:         bits to clear. Sleepers on the affected states are woken when
*                EXTENT_LOCKED is among them. Passing EXTENT_CLEAR_ALL_BITS
*                clears every non-control bit in the range regardless of
*                state (ie for truncate).
* @cached_state: optional hint/result pointer. If it points at a state that is
*                in the tree and covers @start, the tree search is skipped.
*                When EXTENT_LOCKED or EXTENT_BOUNDARY is being cleared the
*                caller's reference is consumed and *cached_state is set to
*                NULL.
* @mask:         gfp mask used for the preallocation done outside the lock
* @changeset:    optional changeset that records the ranges that changed
*
* This takes the tree lock. In the code below the return value is always 0:
* a failed split is reported through extent_io_tree_panic() and a failed
* atomic allocation makes the function drop the lock and retry.
*/
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_state **cached_state,
gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
struct extent_state *prealloc = NULL;
u64 last_end;
int err;
int clear = 0;
int wake;
int delete = (bits & EXTENT_CLEAR_ALL_BITS);
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
/* "Delete" means: clear everything except the control bits. */
if (delete)
bits |= ~EXTENT_CTLBITS;
/* EXTENT_NORESERVE is always cleared together with EXTENT_DELALLOC. */
if (bits & EXTENT_DELALLOC)
bits |= EXTENT_NORESERVE;
wake = (bits & EXTENT_LOCKED) ? 1 : 0;
/* 'clear' means the caller's cached state reference gets dropped below. */
if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
clear = 1;
again:
if (!prealloc) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
* is the case if we only have in the tree extent states that
* cover our input range and don't also cover any other range.
* If we end up needing a new extent state we allocate it later.
*/
prealloc = alloc_extent_state(mask);
}
spin_lock(&tree->lock);
if (cached_state) {
cached = *cached_state;
/* Only consulted on the first pass: the pointer is forgotten here. */
if (clear) {
*cached_state = NULL;
cached_state = NULL;
}
if (cached && extent_state_in_tree(cached) &&
cached->start <= start && cached->end > start) {
/*
* The state is still in the tree (checked above), so this only
* drops the caller's reference.
*/
if (clear)
refcount_dec(&cached->refs);
state = cached;
goto hit_next;
}
if (clear)
free_extent_state(cached);
}
/* This search will find the extents that end after our range starts. */
state = tree_search(tree, start);
if (!state)
goto out;
hit_next:
if (state->start > end)
goto out;
WARN_ON(state->end < start);
last_end = state->end;
/* The state doesn't have the wanted bits, go ahead. */
if (!(state->state & bits)) {
state = next_state(state);
goto next;
}
/*
*     | ---- desired range ---- |
*  | state | or
*  | ------------- state -------------- |
*
* We need to split the extent we found, and may flip bits on second
* half.
*
* If the extent we found extends past our range, we just split and
* search again. It'll get split again the next time though.
*
* If the extent we found is inside our range, we clear the desired bit
* on it.
*/
if (state->start < start) {
prealloc = alloc_extent_state_atomic(prealloc);
/* Atomic allocation failed: drop the lock and retry with @mask. */
if (!prealloc)
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
extent_io_tree_panic(tree, err);
/* prealloc is now either in the tree or was released by split_state(). */
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) {
state = clear_state_bit(tree, state, bits, wake, changeset);
goto next;
}
goto search_again;
}
/*
* | ---- desired range ---- |
*                        | state |
* We need to split the extent, and clear the bit on the first half.
*/
if (state->start <= end && state->end > end) {
prealloc = alloc_extent_state_atomic(prealloc);
/* Atomic allocation failed: drop the lock and retry with @mask. */
if (!prealloc)
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
extent_io_tree_panic(tree, err);
if (wake)
wake_up(&state->wq);
/* After the split, prealloc is the half that lies inside our range. */
clear_state_bit(tree, prealloc, bits, wake, changeset);
prealloc = NULL;
goto out;
}
state = clear_state_bit(tree, state, bits, wake, changeset);
next:
/* Avoid wrapping 'start' around when the state ends at the maximum offset. */
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
if (start <= end && state && !need_resched())
goto hit_next;
search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
out:
spin_unlock(&tree->lock);
if (prealloc)
free_extent_state(prealloc);
return 0;
}
/*
* Sleep (uninterruptibly) on @state's wait queue.
*
* Must be called with tree->lock held. The task is queued on state->wq before
* the lock is dropped, then it schedules; the lock is taken again before
* returning. The caller in this file, wait_extent_bit(), holds its own
* reference on @state around this call.
*/
static void wait_on_state(struct extent_io_tree *tree,
struct extent_state *state)
__releases(tree->lock)
__acquires(tree->lock)
{
DEFINE_WAIT(wait);
prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock(&tree->lock);
schedule();
spin_lock(&tree->lock);
finish_wait(&state->wq, &wait);
}
/*
* Wait for one or more bits to clear on a range in the state tree.
*
* @tree:         the io tree to look at
* @start:        first byte of the range
* @end:          last byte of the range; the range [start, end] is inclusive
* @bits:         wait as long as any state in the range has one of these set
* @cached_state: optional hint. If it points at a state that is in the tree
*                and covers @start, the tree search is skipped. On return the
*                reference is dropped and *cached_state is set to NULL.
*
* The tree lock is taken by this function
*/
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
struct extent_state **cached_state)
{
struct extent_state *state;
btrfs_debug_check_extent_io_range(tree, start, end);
spin_lock(&tree->lock);
again:
/*
* Maintain cached_state, as we may not remove it from the tree if there
* are more bits than the bits we're waiting on set on this state.
*/
if (cached_state && *cached_state) {
state = *cached_state;
if (extent_state_in_tree(state) &&
state->start <= start && start < state->end)
goto process_node;
}
while (1) {
/*
* This search will find all the extents that end after our
* range starts.
*/
state = tree_search(tree, start);
process_node:
if (!state)
break;
if (state->start > end)
goto out;
if (state->state & bits) {
/*
* Hold a reference across the sleep so the state cannot be
* freed under us, then restart the scan from its start offset.
*/
start = state->start;
refcount_inc(&state->refs);
wait_on_state(tree, state);
free_extent_state(state);
goto again;
}
start = state->end + 1;
if (start > end)
break;
/*
* When cond_resched_lock() returns zero, 'state' is still used and we
* simply step to its successor. Otherwise fall through to a fresh
* tree search.
*/
if (!cond_resched_lock(&tree->lock)) {
state = next_state(state);
goto process_node;
}
}
out:
/* This state is no longer useful, clear it and free it up. */
if (cached_state && *cached_state) {
state = *cached_state;
*cached_state = NULL;
free_extent_state(state);
}
spin_unlock(&tree->lock);
}
/*
 * Remember @state in *@cached_ptr, taking an extra reference on it.
 *
 * Nothing happens when @cached_ptr is NULL or already holds a state. When
 * @flags is non-zero the state is only cached if it has at least one of
 * those bits set.
 */
static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (!cached_ptr || *cached_ptr)
		return;

	if (flags && !(state->state & flags))
		return;

	*cached_ptr = state;
	refcount_inc(&state->refs);
}
/*
 * Cache @state in *@cached_ptr, but only if it has EXTENT_LOCKED or
 * EXTENT_BOUNDARY set. See cache_state_if_flags() for when caching is
 * skipped.
 */
static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	/* A void function must not return an expression, just make the call. */
	cache_state_if_flags(state, cached_ptr,
			     EXTENT_LOCKED | EXTENT_BOUNDARY);
}
/*
 * Find the first state struct that ends at or after 'start' and has any of
 * 'bits' set, and return it. tree->lock must be held. NULL is returned if
 * nothing was found.
 */
static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
							u64 start, u32 bits)
{
	struct extent_state *candidate;

	/*
	 * The search finds the first extent that ends after our range starts,
	 * from there on just walk the tree in order.
	 */
	for (candidate = tree_search(tree, start); candidate;
	     candidate = next_state(candidate)) {
		if (candidate->end < start)
			continue;
		if (candidate->state & bits)
			return candidate;
	}
	return NULL;
}
/*
 * Find the first offset in the io tree with one or more @bits set.
 *
 * @tree:         the io tree to search
 * @start:        offset to start the search from
 * @start_ret:    on success, receives the start of the state that was found
 * @end_ret:      on success, receives the end of the state that was found
 * @bits:         bits to look for
 * @cached_state: optional. On entry it may hold the state returned by a
 *                previous call; if that state ends right before @start and is
 *                still in the tree, the search continues from it instead of
 *                descending the tree. On return it holds a reference to the
 *                state that was found, or NULL if nothing was found.
 *
 * Note: If there are multiple bits set in @bits, any of them will match.
 *
 * Return 0 if we find something, and update @start_ret and @end_ret.
 * Return 1 if we found nothing.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, u32 bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			while ((state = next_state(state)) != NULL) {
				if (state->state & bits)
					break;
			}
			/*
			 * Drop the old cached state in both cases. If we found
			 * the next extent state, this lets it be cached below
			 * so that future calls don't go over the same extent
			 * states again. If we found nothing, the old one is
			 * useless now. The old state is still in the tree, so
			 * this only drops the cache's reference.
			 */
			free_extent_state(*cached_state);
			*cached_state = NULL;
			if (state)
				goto got_it;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}
/*
 * Find a contiguous area of bits
 *
 * @tree:      io tree to check
 * @start:     offset to start the search from
 * @start_ret: the first offset we found with the bits set
 * @end_ret:   the end of the contiguous range that was found
 * @bits:      bits to look for
 *
 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
 * to set bits appropriately, and then merge them again. During this time it
 * will drop the tree->lock, so use this helper if you want to find the actual
 * contiguous area for given bits. We search for the first state with one of
 * the bits set and then keep walking forward for as long as the following
 * states are byte-adjacent to what we have so far. Only the first state is
 * checked against @bits; the following ones are checked for adjacency only.
 *
 * Return 0 if a range was found, 1 otherwise.
 */
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
			       u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *first;
	struct extent_state *cur;
	int ret = 1;

	spin_lock(&tree->lock);
	first = find_first_extent_bit_state(tree, start, bits);
	if (first) {
		*start_ret = first->start;
		*end_ret = first->end;
		/* Extend the range while there is no gap before the next state. */
		for (cur = next_state(first);
		     cur && cur->start <= (*end_ret + 1);
		     cur = next_state(cur))
			*end_ret = cur->end;
		ret = 0;
	}
	spin_unlock(&tree->lock);
	return ret;
}
/*
 * Find a contiguous range of bytes in the file marked as delalloc, not more
 * than 'max_bytes' (the state that crosses the limit is still included
 * whole).
 *
 * @tree:         the io tree to search
 * @start:        in: offset to start searching from; out: start of the range
 *                that was found
 * @end:          out: end of the range that was found. If nothing was found
 *                it is set to the end of the first (non-delalloc) state that
 *                was looked at, or to (u64)-1 if there was no state at all.
 * @max_bytes:    stop extending the range once this many bytes are covered
 * @cached_state: out: the first state of the range, with a reference taken
 *
 * The range stops at the first gap, at the first state without
 * EXTENT_DELALLOC and before any later state with EXTENT_BOUNDARY set.
 *
 * True is returned if we find something, false if nothing was in the tree.
 */
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
			       u64 *end, u64 max_bytes,
			       struct extent_state **cached_state)
{
	struct extent_state *state;
	u64 expected_start = *start;
	u64 covered = 0;
	bool found = false;

	spin_lock(&tree->lock);

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	state = tree_search(tree, expected_start);
	if (!state)
		*end = (u64)-1;

	for (; state; state = next_state(state)) {
		if (found) {
			/* A gap or a boundary ends the contiguous range. */
			if (state->start != expected_start ||
			    (state->state & EXTENT_BOUNDARY))
				break;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			break;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			refcount_inc(&state->refs);
			found = true;
		}
		*end = state->end;
		expected_start = state->end + 1;
		covered += state->end - state->start + 1;
		if (covered >= max_bytes)
			break;
	}

	spin_unlock(&tree->lock);
	return found;
}
/*
* Set some bits on a range in the tree. This may require allocations or
* sleeping, so the gfp mask is used to indicate what is allowed.
*
* @tree:         the io tree to operate on
* @start:        first byte of the range
* @end:          last byte of the range; [start, end] is inclusive
* @bits:         bits to set; EXTENT_LOCKED is treated as exclusive
* @failed_start: must be non-NULL when an exclusive bit is requested and NULL
*                otherwise (asserted)
* @failed_state: optional with exclusive bits, must be NULL otherwise
* @cached_state: optional hint/result pointer for the state covering @start
* @changeset:    optional changeset that records the ranges that changed
* @mask:         gfp mask used for the preallocation done outside the lock
*
* If any of the exclusive bits are set, this will fail with -EEXIST if some
* part of the range already has the desired bits set. The extent_state of the
* existing range is returned in failed_state in this case, and the start of the
* existing range is returned in failed_start. failed_state is used as an
* optimization for wait_extent_bit, failed_start must be used as the source of
* truth as failed_state may have changed since we returned.
*
* Returns 0 on success, -EEXIST as described above, or the error of a failed
* split (after reporting it through extent_io_tree_panic()). A failed atomic
* allocation under the lock is not an error: the lock is dropped and the
* operation is retried from the current offset.
*
* This takes the tree lock.
*/
static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, u64 *failed_start,
struct extent_state **failed_state,
struct extent_state **cached_state,
struct extent_changeset *changeset, gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node **p;
struct rb_node *parent;
int err = 0;
u64 last_start;
u64 last_end;
u32 exclusive_bits = (bits & EXTENT_LOCKED);
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
if (exclusive_bits)
ASSERT(failed_start);
else
ASSERT(failed_start == NULL && failed_state == NULL);
again:
if (!prealloc) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
* is the case if we only have in the tree extent states that
* cover our input range and don't also cover any other range.
* If we end up needing a new extent state we allocate it later.
*/
prealloc = alloc_extent_state(mask);
}
spin_lock(&tree->lock);
/* Use the cached state if it is still in the tree and covers 'start'. */
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start <= start && state->end > start &&
extent_state_in_tree(state))
goto hit_next;
}
/*
* This search will find all the extents that end after our range
* starts.
*/
state = tree_search_for_insert(tree, start, &p, &parent);
if (!state) {
/* Nothing at or after 'start': insert one state for the whole range. */
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
prealloc->start = start;
prealloc->end = end;
insert_state_fast(tree, prealloc, p, parent, bits, changeset);
cache_state(prealloc, cached_state);
prealloc = NULL;
goto out;
}
hit_next:
last_start = state->start;
last_end = state->end;
/*
* | ---- desired range ---- |
* | state |
*
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
if (state->state & exclusive_bits) {
*failed_start = state->start;
cache_state(state, failed_state);
err = -EEXIST;
goto out;
}
set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
/* Avoid wrapping 'start' around at the maximum offset. */
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
state = next_state(state);
if (start < end && state && state->start == start &&
!need_resched())
goto hit_next;
goto search_again;
}
/*
*     | ---- desired range ---- |
* | state |
*   or
* | ------------- state -------------- |
*
* We need to split the extent we found, and may flip bits on second
* half.
*
* If the extent we found extends past our range, we just split and
* search again. It'll get split again the next time though.
*
* If the extent we found is inside our range, we set the desired bit
* on it.
*/
if (state->start < start) {
if (state->state & exclusive_bits) {
*failed_start = start;
cache_state(state, failed_state);
err = -EEXIST;
goto out;
}
/*
* If this extent already has all the bits we want set, then
* skip it, not necessary to split it or do anything with it.
*/
if ((state->state & bits) == bits) {
start = state->end + 1;
cache_state(state, cached_state);
goto search_again;
}
prealloc = alloc_extent_state_atomic(prealloc);
/* Atomic allocation failed: drop the lock and retry with @mask. */
if (!prealloc)
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
extent_io_tree_panic(tree, err);
/* prealloc is now either in the tree or was released by split_state(). */
prealloc = NULL;
if (err)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
state = next_state(state);
if (start < end && state && state->start == start &&
!need_resched())
goto hit_next;
}
goto search_again;
}
/*
* | ---- desired range ---- |
*     | state | or               | state |
*
* There's a hole, we need to insert something in it and ignore the
* extent we found.
*/
if (state->start > start) {
u64 this_end;
/* The new state covers the hole only, up to the state we found. */
if (end < last_start)
this_end = end;
else
this_end = last_start - 1;
prealloc = alloc_extent_state_atomic(prealloc);
/* Atomic allocation failed: drop the lock and retry with @mask. */
if (!prealloc)
goto search_again;
/*
* Avoid to free 'prealloc' if it can be merged with the later
* extent.
*/
prealloc->start = start;
prealloc->end = this_end;
err = insert_state(tree, prealloc, bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
goto search_again;
}
/*
* | ---- desired range ---- |
*                        | state |
*
* We need to split the extent, and set the bit on the first half
*/
if (state->start <= end && state->end > end) {
if (state->state & exclusive_bits) {
*failed_start = start;
cache_state(state, failed_state);
err = -EEXIST;
goto out;
}
prealloc = alloc_extent_state_atomic(prealloc);
/* Atomic allocation failed: drop the lock and retry with @mask. */
if (!prealloc)
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
extent_io_tree_panic(tree, err);
/* After the split, prealloc is the half that lies inside our range. */
set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
goto out;
}
search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
out:
spin_unlock(&tree->lock);
if (prealloc)
free_extent_state(prealloc);
return err;
}
/*
 * Set @bits on the inclusive range [@start, @end] of @tree.
 *
 * Thin wrapper around __set_extent_bit() that passes neither a failed_start/
 * failed_state pair nor a changeset. Returns whatever __set_extent_bit()
 * returns.
 */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   u32 bits, struct extent_state **cached_state, gfp_t mask)
{
	int ret;

	ret = __set_extent_bit(tree, start, end, bits, NULL, NULL,
			       cached_state, NULL, mask);
	return ret;
}
/*
* Convert all bits in a given range from one bit to another
*
* @tree: the io tree to search
* @start: the start offset in bytes
* @end: the end offset in bytes (inclusive)
* @bits: the bits to set in this range
* @clear_bits: the bits to clear in this range
* @cached_state: state that we're going to cache
*
* This will go through and set bits for the given range. If any states exist
* already in this range they are set with the given bit and cleared of the
* clear_bits. This is only meant to be used by things that are mergeable, ie.
* converting from say DELALLOC to DIRTY. This is not meant to be used with
* boundary bits like LOCK.
*
* The preallocation done outside the tree lock uses GFP_NOFS. If that extent
* state has already been consumed when another one is needed under the lock,
* alloc_extent_state_atomic() is used instead.
*
* Return: 0 on success, -ENOMEM if an extent state could not be allocated. On
* -ENOMEM the part of the range that was processed before the failure has
* already been converted. A non-zero error from split_state() is passed to
* extent_io_tree_panic() and, in the first split case, also returned.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, u32 clear_bits,
struct extent_state **cached_state)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node **p;
struct rb_node *parent;
int err = 0;
u64 last_start;
u64 last_end;
bool first_iteration = true;
btrfs_debug_check_extent_io_range(tree, start, end);
trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
clear_bits);
again:
if (!prealloc) {
/*
* Best effort, don't worry if extent state allocation fails
* here for the first iteration. We might have a cached state
* that matches exactly the target range, in which case no
* extent state allocations are needed. We'll only know this
* after locking the tree.
*/
prealloc = alloc_extent_state(GFP_NOFS);
if (!prealloc && !first_iteration)
return -ENOMEM;
}
spin_lock(&tree->lock);
/*
* Fast path: start from the cached state if it is still in the tree and
* covers 'start', skipping the tree search.
*/
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start <= start && state->end > start &&
extent_state_in_tree(state))
goto hit_next;
}
/*
* This search will find all the extents that end after our range
* starts.
*/
state = tree_search_for_insert(tree, start, &p, &parent);
if (!state) {
/*
* Nothing was found, so a single new state covering the whole
* remaining range [start, end] is inserted at the position the
* search returned.
*/
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
prealloc->start = start;
prealloc->end = end;
insert_state_fast(tree, prealloc, p, parent, bits, NULL);
cache_state(prealloc, cached_state);
/* The state is in the tree now, do not free it at 'out'. */
prealloc = NULL;
goto out;
}
hit_next:
/*
* Remember the bounds of the state we found: 'state' is reassigned by
* clear_state_bit() below, but the original bounds are still needed.
*/
last_start = state->start;
last_end = state->end;
/*
* | ---- desired range ---- |
* | state |
*
* Set the new bits and clear the old ones on what we found, then keep
* going.
*/
if (state->start == start && state->end <= end) {
set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
state = clear_state_bit(tree, state, clear_bits, 0, NULL);
/* last_end + 1 would wrap around, so we are done. */
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
/*
* If the state returned by clear_state_bit() starts exactly where
* we continue, process it right away without dropping the lock,
* unless a reschedule is pending.
*/
if (start < end && state && state->start == start &&
!need_resched())
goto hit_next;
goto search_again;
}
/*
* | ---- desired range ---- |
* | state |
* or
* | ------------- state -------------- |
*
* We need to split the extent we found, and may flip bits on second
* half.
*
* If the extent we found extends past our range, we just split and
* search again. It'll get split again the next time though.
*
* If the extent we found is inside our range, we set the desired bit
* on it.
*/
if (state->start < start) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
err = split_state(tree, state, prealloc, start);
if (err)
extent_io_tree_panic(tree, err);
/*
* prealloc was handed to split_state(); drop our reference so it
* is not freed at 'out'.
*/
prealloc = NULL;
/* Only relevant if extent_io_tree_panic() returned. */
if (err)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
state = clear_state_bit(tree, state, clear_bits, 0, NULL);
/* last_end + 1 would wrap around, so we are done. */
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
if (start < end && state && state->start == start &&
!need_resched())
goto hit_next;
}
goto search_again;
}
/*
* | ---- desired range ---- |
* | state | or | state |
*
* There's a hole, we need to insert something in it and ignore the
* extent we found.
*/
if (state->start > start) {
u64 this_end;
/* The new state ends at 'end' or right before the found state. */
if (end < last_start)
this_end = end;
else
this_end = last_start - 1;
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
/*
* Avoid freeing 'prealloc' if it can be merged with the later
* extent.
*/
prealloc->start = start;
prealloc->end = this_end;
err = insert_state(tree, prealloc, bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
goto search_again;
}
/*
* | ---- desired range ---- |
* | state |
*
* We need to split the extent, and set the bit on the first half.
*/
if (state->start <= end && state->end > end) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
err = split_state(tree, state, prealloc, end + 1);
if (err)
extent_io_tree_panic(tree, err);
/* The bits are changed on 'prealloc', which was passed to the split. */
set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
prealloc = NULL;
goto out;
}
search_again:
if (start > end)
goto out;
/*
* More of the range is left: drop the lock, allow a reschedule and start
* over. From now on a failed preallocation at 'again' is fatal.
*/
spin_unlock(&tree->lock);
cond_resched();
first_iteration = false;
goto again;
out:
spin_unlock(&tree->lock);
/* Release the preallocated state if it was never used. */
if (prealloc)
free_extent_state(prealloc);
return err;
}
/*
* Find the first range that has @bits not set. This range could start before
* @start.
*
* @tree: the tree to search
* @start: offset at/after which the found extent should start
* @start_ret: records the beginning of the range
* @end_ret: records the end of the range (inclusive)
* @bits: the set of bits which must be unset
*
* Since unallocated range is also considered one which doesn't have the bits
* set it's possible that @end_ret contains -1, this happens in case the range
* spans (last_range_end, end of device]. In this case it's up to the caller to
* trim @end_ret to the appropriate size.
*/
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				 u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	struct extent_state *prev = NULL, *next;

	spin_lock(&tree->lock);

	/* Step 1: work out where the range without @bits begins. */
	for (;;) {
		state = tree_search_prev_next(tree, start, &prev, &next);
		if (!state) {
			if (!next) {
				/*
				 * Nothing at or after 'start'. Either the tree
				 * is completely empty (no prev) and the full
				 * range is reported, or we are past the last
				 * extent and the range begins right after it.
				 * The caller has to deal with the open end.
				 */
				*start_ret = prev ? prev->end + 1 : 0;
				*end_ret = -1;
				goto out;
			}
			state = next;
		}

		/*
		 * From here on 'state' either contains 'start' or lies
		 * entirely after it.
		 */
		if (!in_range(start, state->start,
			      state->end - state->start + 1)) {
			/*
			 * |---prev range---|---hole/unset---|---node range---|
			 *                          |
			 *                        start
			 *
			 *                 or
			 *
			 * |---hole/unset--||--first node--|
			 * 0   |
			 *    start
			 */
			*start_ret = prev ? prev->end + 1 : 0;
			break;
		}

		if (!(state->state & bits)) {
			/*
			 * 'start' sits inside a state that does not have the
			 * bits set, so the range begins where that state does.
			 *
			 * |--range with bits cleared----|
			 *      |
			 *      start
			 */
			*start_ret = state->start;
			break;
		}

		/*
		 * |--range with bits sets--|
		 *    |
		 *    start
		 *
		 * Skip past this state and search again.
		 */
		start = state->end + 1;
	}

	/*
	 * Step 2: extend the end of the range state by state until we hit one
	 * that has the bits set.
	 */
	for (; state; state = next_state(state)) {
		if (state->end < start || (state->state & bits)) {
			*end_ret = state->start - 1;
			break;
		}
		*end_ret = state->end;
	}
out:
	spin_unlock(&tree->lock);
}
/*
* Count the number of bytes in the tree that have a given bit(s) set. This
* walks the extent states one by one under the tree lock, so it can be fairly
* slow. The total number found is returned.
*/
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     u32 bits, int contig)
{
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	bool found = false;

	if (WARN_ON(search_end <= cur_start))
		return 0;

	spin_lock(&tree->lock);

	/*
	 * The search returns the first extent that ends after our range
	 * starts; from there we walk forward state by state.
	 */
	for (state = tree_search(tree, cur_start); state;
	     state = next_state(state)) {
		u64 overlap_start;
		u64 overlap_end;

		if (state->start > search_end)
			break;
		/* In contiguous mode a gap after the last match ends the walk. */
		if (contig && found && state->start > last + 1)
			break;

		if (state->end < cur_start || (state->state & bits) != bits) {
			/* A non-matching state also ends a contiguous run. */
			if (contig && found)
				break;
			continue;
		}

		/* Only count the part of the state inside the search range. */
		overlap_start = max(cur_start, state->start);
		overlap_end = min(search_end, state->end);
		total_bytes += overlap_end + 1 - overlap_start;
		if (total_bytes >= max_bytes)
			break;

		/* Report where the first matching byte was found. */
		if (!found) {
			*start = overlap_start;
			found = true;
		}
		last = state->end;
	}
	spin_unlock(&tree->lock);

	return total_bytes;
}
/*
* Search a range in the state tree for a given mask. If 'filled' == 1, this
* returns 1 only if the whole range is covered by extent states that have the
* bits set. Otherwise, 1 is returned if any bit in the range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   u32 bits, int filled, struct extent_state *cached)
{
	struct extent_state *state;
	bool use_cached;
	int bitset = 0;

	spin_lock(&tree->lock);

	/* Start from the cached state when it is in the tree and covers 'start'. */
	use_cached = cached && extent_state_in_tree(cached) &&
		     cached->start <= start && cached->end > start;
	state = use_cached ? cached : tree_search(tree, start);

	for (; state && start <= end; state = next_state(state)) {
		/* A hole in front of this state: the range is not filled. */
		if (filled && state->start > start) {
			bitset = 0;
			break;
		}

		/* This state lies completely behind the range. */
		if (state->start > end)
			break;

		if (!(state->state & bits)) {
			/* One state without the bits is enough to fail 'filled'. */
			if (filled) {
				bitset = 0;
				break;
			}
		} else {
			bitset = 1;
			/* Any hit is enough when 'filled' is not requested. */
			if (!filled)
				break;
		}

		/* state->end + 1 would wrap around. */
		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
	}

	/* We ran out of states and were still inside of our range. */
	if (filled && !state)
		bitset = 0;

	spin_unlock(&tree->lock);
	return bitset;
}
/* Wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits, struct extent_changeset *changeset)
{
	int ret;

	/*
	 * EXTENT_LOCKED is not supported here yet: the changeset records
	 * every bit that changes, so a locked request would either fail with
	 * -EEXIST or end up recording the whole range.
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	/* No failure reporting and no cached state, only the changeset. */
	ret = __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL,
			       changeset, GFP_NOFS);
	return ret;
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     u32 bits, struct extent_changeset *changeset)
{
	int ret;

	/*
	 * EXTENT_LOCKED is rejected for the same reason as in
	 * set_record_extent_bits().
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	/* No cached state is used; cleared bits go into the changeset. */
	ret = __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS,
				 changeset);
	return ret;
}
/*
 * Try to set EXTENT_LOCKED on [start, end] without waiting.
 *
 * Returns 0 if __set_extent_bit() reported -EEXIST, in which case any part of
 * the range that was locked before the conflict is unlocked again. Returns 1
 * otherwise.
 */
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    struct extent_state **cached)
{
	u64 failed_start;
	int ret;

	ret = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       NULL, cached, NULL, GFP_NOFS);
	if (ret != -EEXIST)
		return 1;

	/* Undo the partial lock on [start, failed_start - 1], if any. */
	if (failed_start > start)
		clear_extent_bit(tree, start, failed_start - 1,
				 EXTENT_LOCKED, cached);
	return 0;
}
/*
* Set EXTENT_LOCKED on the range between start and end (inclusive), inserting
* extent states as needed. If part of the range is already locked, wait for
* it to be unlocked and try again.
*/
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		struct extent_state **cached_state)
{
	struct extent_state *failed_state = NULL;
	u64 failed_start;
	int err;

	for (;;) {
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
				       &failed_start, &failed_state,
				       cached_state, NULL, GFP_NOFS);
		if (err != -EEXIST)
			break;

		/*
		 * Part of the range is already locked. Drop what we managed
		 * to lock in front of the conflict, wait for the conflicting
		 * range to be unlocked and retry the whole range.
		 */
		if (failed_start != start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, cached_state);

		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
				&failed_state);
	}

	return err;
}
/* Tear down the extent state slab cache set up by extent_state_init_cachep(). */
void __cold extent_state_free_cachep(void)
{
	/* Run the leak check first, while the cache still exists. */
	btrfs_extent_state_leak_debug_check();

	kmem_cache_destroy(extent_state_cache);
}
/*
 * Create the slab cache that extent states are allocated from.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
int __init extent_state_init_cachep(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
					       sizeof(struct extent_state), 0,
					       SLAB_MEM_SPREAD, NULL);

	return extent_state_cache ? 0 : -ENOMEM;
}