linux-stable/fs/btrfs/extent-tree.h
Josef Bacik cd361199ff btrfs: wait on uncached block groups on every allocation loop
My initial fix for the generic/475 hangs was related to metadata, but
our CI testing uncovered another case where we hang for similar reasons.
We again have a task with a plug that is holding an outstanding request
that is keeping the dm device from finishing it's suspend, and that task
is stuck in the allocator.

This time it is stuck trying to allocate data, but we do not have a
block group that matches the size class.  The larger loop in the
allocator looks like this (simplified of course)

  find_free_extent
    for_each_block_group {
      ffe_ctl->cached == btrfs_block_group_cache_done(bg)
      if (!ffe_ctl->cached)
	ffe_ctl->have_caching_bg = true;
      do_allocation()
	btrfs_wait_block_group_cache_progress();
    }

    if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
      go search again;

In my earlier fix we were trying to allocate from the block group, but
we weren't waiting for the progress because we were only waiting for the
free space to be >= the amount of free space we wanted.  My fix made it
so we waited for forward progress to be made as well, so we would be
sure to wait.

This time however we did not have a block group that matched our size
class, so what was happening was this

  find_free_extent
    for_each_block_group {
      ffe_ctl->cached == btrfs_block_group_cache_done(bg)
      if (!ffe_ctl->cached)
	ffe_ctl->have_caching_bg = true;
      if (size_class_doesn't_match())
	goto loop;
      do_allocation()
	btrfs_wait_block_group_cache_progress();
  loop:
      release_block_group(block_group);
    }

    if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
      go search again;

The size_class_doesn't_match() part was true, so we'd just skip this
block group and never wait for caching, and then because we found a
caching block group we'd just go back and do the loop again.  We never
sleep and thus never flush the plug and we have the same deadlock.

Fix the logic for waiting on the block group caching to instead do it
unconditionally when we goto loop.  This takes the logic out of the
allocation step, so now the loop looks more like this

  find_free_extent
    for_each_block_group {
      ffe_ctl->cached == btrfs_block_group_cache_done(bg)
      if (!ffe_ctl->cached)
	ffe_ctl->have_caching_bg = true;
      if (size_class_doesn't_match())
	goto loop;
      do_allocation()
	btrfs_wait_block_group_cache_progress();
  loop:
      if (loop > LOOP_CACHING_NOWAIT && !ffe_ctl->retry_uncached &&
	  !ffe_ctl->cached) {
	 ffe_ctl->retry_uncached = true;
	 btrfs_wait_block_group_cache_progress();
      }

      release_block_group(block_group);
    }

    if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
      go search again;

This simplifies the logic a lot, and makes sure that if we're hitting
uncached block groups we're always waiting on them at some point.

I ran this through 100 iterations of generic/475, as this particular
case was harder to hit than the previous one.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-08-21 14:54:47 +02:00

151 lines
4.9 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_EXTENT_TREE_H
#define BTRFS_EXTENT_TREE_H
#include "misc.h"
#include "block-group.h"
struct btrfs_free_cluster;
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
BTRFS_EXTENT_ALLOC_ZONED,
};
struct find_free_extent_ctl {
/* Basic allocation info */
u64 ram_bytes;
u64 num_bytes;
u64 min_alloc_size;
u64 empty_size;
u64 flags;
int delalloc;
/* Where to start the search inside the bg */
u64 search_start;
/* For clustered allocation */
u64 empty_cluster;
struct btrfs_free_cluster *last_ptr;
bool use_cluster;
bool have_caching_bg;
bool orig_have_caching_bg;
/* Allocation is called for tree-log */
bool for_treelog;
/* Allocation is called for data relocation */
bool for_data_reloc;
/* RAID index, converted from flags */
int index;
/*
* Current loop number, check find_free_extent_update_loop() for details
*/
int loop;
/*
* Set to true if we're retrying the allocation on this block group
* after waiting for caching progress, this is so that we retry only
* once before moving on to another block group.
*/
bool retry_uncached;
/* If current block group is cached */
int cached;
/* Max contiguous hole found */
u64 max_extent_size;
/* Total free space from free space cache, not always contiguous */
u64 total_free_space;
/* Found result */
u64 found_offset;
/* Hint where to start looking for an empty space */
u64 hint_byte;
/* Allocation policy */
enum btrfs_extent_allocation_policy policy;
/* Whether or not the allocator is currently following a hint */
bool hinted;
/* Size class of block groups to prefer in early loops */
enum btrfs_block_group_size_class size_class;
};
enum btrfs_inline_ref_type {
BTRFS_REF_TYPE_INVALID,
BTRFS_REF_TYPE_BLOCK,
BTRFS_REF_TYPE_DATA,
BTRFS_REF_TYPE_ANY,
};
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count);
void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr, bool strict,
struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 root_id,
struct extent_buffer *buf,
u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 owner,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins);
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent);
#endif