ext4: Give symbolic names to mballoc criterias

mballoc criteria have historically been referred to by number
(CR0, CR1, ...); however, this makes it hard to understand
what each criterion is about.

Change these criteria from numbers to symbolic names and add
relevant comments. While we are at it, also reformat and add some
comments to ext4_seq_mb_stats_show() for better readability.

Additionally, define CR_FAST, which marks the criterion
below which we can make quicker decisions such as:
  * quitting early if (free blocks < requested len)
  * avoiding scanning free extents smaller than the required len
  * avoiding initializing the buddy cache and working with the existing cache
  * limiting prefetches

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Link: https://lore.kernel.org/r/a2dc6ec5aea5e5e68cf8e788c2a964ffead9c8b0.1685449706.git.ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
This commit is contained in:
Ojaswin Mujoo 2023-05-30 18:03:50 +05:30 committed by Theodore Ts'o
parent 7e170922f0
commit f52f3d2b9f
5 changed files with 210 additions and 146 deletions

View File

@ -135,16 +135,45 @@ enum SHIFT_DIRECTION {
*/ */
#define EXT4_MB_NUM_CRS 5 #define EXT4_MB_NUM_CRS 5
/* /*
* All possible allocation criterias for mballoc * All possible allocation criterias for mballoc. Lower are faster.
*/ */
enum criteria { enum criteria {
CR0, /*
CR1, * Used when number of blocks needed is a power of 2. This doesn't
CR1_5, * trigger any disk IO except prefetch and is the fastest criteria.
CR2, */
CR3, CR_POWER2_ALIGNED,
/*
* Tries to lookup in-memory data structures to find the most suitable
* group that satisfies goal request. No disk IO except block prefetch.
*/
CR_GOAL_LEN_FAST,
/*
* Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal length to
* the best available length for faster allocation.
*/
CR_BEST_AVAIL_LEN,
/*
* Reads each block group sequentially, performing disk IO if necessary, to
* find a suitable block group. Tries to allocate goal length but might trim
* the request if nothing is found after enough tries.
*/
CR_GOAL_LEN_SLOW,
/*
* Finds the first free set of blocks and allocates those. This is only
* used in rare cases when CR_GOAL_LEN_SLOW also fails to allocate
* anything.
*/
CR_ANY_FREE,
}; };
/* criteria below which we use fast block scanning and avoid unnecessary IO */
#define CR_FAST CR_GOAL_LEN_SLOW
/* /*
* Flags used in mballoc's allocation_context flags field. * Flags used in mballoc's allocation_context flags field.
* *
@ -183,11 +212,11 @@ enum criteria {
/* Do strict check for free blocks while retrying block allocation */ /* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000 #define EXT4_MB_STRICT_CHECK 0x4000
/* Large fragment size list lookup succeeded at least once for cr = 0 */ /* Large fragment size list lookup succeeded at least once for cr = 0 */
#define EXT4_MB_CR0_OPTIMIZED 0x8000 #define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000
/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
#define EXT4_MB_CR1_OPTIMIZED 0x00010000 #define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000
/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ /* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */
#define EXT4_MB_CR1_5_OPTIMIZED 0x00020000 #define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000
struct ext4_allocation_request { struct ext4_allocation_request {
/* target inode for block we're allocating */ /* target inode for block we're allocating */
@ -1553,7 +1582,7 @@ struct ext4_sb_info {
unsigned long s_mb_last_start; unsigned long s_mb_last_start;
unsigned int s_mb_prefetch; unsigned int s_mb_prefetch;
unsigned int s_mb_prefetch_limit; unsigned int s_mb_prefetch_limit;
unsigned int s_mb_cr1_5_max_trim_order; unsigned int s_mb_best_avail_max_trim_order;
/* stats for buddy allocator */ /* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@ -1566,9 +1595,9 @@ struct ext4_sb_info {
atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_len_goals; /* len goal hits */
atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */ atomic_t s_bal_2orders; /* 2^order hits */
atomic_t s_bal_cr0_bad_suggestions; atomic_t s_bal_p2_aligned_bad_suggestions;
atomic_t s_bal_cr1_bad_suggestions; atomic_t s_bal_goal_fast_bad_suggestions;
atomic_t s_bal_cr1_5_bad_suggestions; atomic_t s_bal_best_avail_bad_suggestions;
atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */

View File

@ -154,27 +154,31 @@
* structures to decide the order in which groups are to be traversed for * structures to decide the order in which groups are to be traversed for
* fulfilling an allocation request. * fulfilling an allocation request.
* *
* At CR0 , we look for groups which have the largest_free_order >= the order * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order
* of the request. We directly look at the largest free order list in the data * >= the order of the request. We directly look at the largest free order list
* structure (1) above where largest_free_order = order of the request. If that * in the data structure (1) above where largest_free_order = order of the
* list is empty, we look at remaining list in the increasing order of * request. If that list is empty, we look at remaining list in the increasing
* largest_free_order. This allows us to perform CR0 lookup in O(1) time. * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED
* lookup in O(1) time.
* *
* At CR1, we only consider groups where average fragment size > request * At CR_GOAL_LEN_FAST, we only consider groups where
* size. So, we lookup a group which has average fragment size just above or * average fragment size > request size. So, we lookup a group which has average
* equal to request size using our average fragment size group lists (data * fragment size just above or equal to request size using our average fragment
* structure 2) in O(1) time. * size group lists (data structure 2) in O(1) time.
* *
* At CR1.5 (aka CR1_5), we aim to optimize allocations which can't be satisfied * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied
* in CR1. The fact that we couldn't find a group in CR1 suggests that there is * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in
* no BG that has average fragment size > goal length. So before falling to the * CR_GOAL_LEN_FAST suggests that there is no BG that has avg
* slower CR2, in CR1.5 we proactively trim goal length and then use the same * fragment size > goal length. So before falling to the slower
* fragment lists as CR1 to find a BG with a big enough average fragment size. * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and
* This increases the chances of finding a suitable block group in O(1) time and * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big
* results * in faster allocation at the cost of reduced size of allocation. * enough average fragment size. This increases the chances of finding a
* suitable block group in O(1) time and results in faster allocation at the
* cost of reduced size of allocation.
* *
* If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
* linear order which requires O(N) search time for each CR0 and CR1 phase. * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and
* CR_GOAL_LEN_FAST phase.
* *
* The regular allocator (using the buddy cache) supports a few tunables. * The regular allocator (using the buddy cache) supports a few tunables.
* *
@ -359,8 +363,8 @@
* - bitlock on a group (group) * - bitlock on a group (group)
* - object (inode/locality) (object) * - object (inode/locality) (object)
* - per-pa lock (pa) * - per-pa lock (pa)
* - cr0 lists lock (cr0) * - cr_power2_aligned lists lock (cr_power2_aligned)
* - cr1 tree lock (cr1) * - cr_goal_len_fast lists lock (cr_goal_len_fast)
* *
* Paths: * Paths:
* - new pa * - new pa
@ -392,7 +396,7 @@
* *
* - allocation path (ext4_mb_regular_allocator) * - allocation path (ext4_mb_regular_allocator)
* group * group
* cr0/cr1 * cr_power2_aligned/cr_goal_len_fast
*/ */
static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_ac_cachep;
@ -866,7 +870,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
* Choose next group by traversing largest_free_order lists. Updates *new_cr if * Choose next group by traversing largest_free_order lists. Updates *new_cr if
* cr level needs an update. * cr level needs an update.
*/ */
static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{ {
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
@ -876,8 +880,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
if (ac->ac_status == AC_STATUS_FOUND) if (ac->ac_status == AC_STATUS_FOUND)
return; return;
if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
atomic_inc(&sbi->s_bal_cr0_bad_suggestions); atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);
grp = NULL; grp = NULL;
for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
@ -892,8 +896,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
bb_largest_free_order_node) { bb_largest_free_order_node) {
if (sbi->s_mb_stats) if (sbi->s_mb_stats)
atomic64_inc(&sbi->s_bal_cX_groups_considered[CR0]); atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
if (likely(ext4_mb_good_group(ac, iter->bb_group, CR0))) { if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) {
grp = iter; grp = iter;
break; break;
} }
@ -905,10 +909,10 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
if (!grp) { if (!grp) {
/* Increment cr and search again */ /* Increment cr and search again */
*new_cr = CR1; *new_cr = CR_GOAL_LEN_FAST;
} else { } else {
*group = grp->bb_group; *group = grp->bb_group;
ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
} }
} }
@ -947,16 +951,16 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o
* Choose next group by traversing average fragment size list of suitable * Choose next group by traversing average fragment size list of suitable
* order. Updates *new_cr if cr level needs an update. * order. Updates *new_cr if cr level needs an update.
*/ */
static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{ {
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *grp = NULL; struct ext4_group_info *grp = NULL;
int i; int i;
if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) {
if (sbi->s_mb_stats) if (sbi->s_mb_stats)
atomic_inc(&sbi->s_bal_cr1_bad_suggestions); atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions);
} }
for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
@ -968,22 +972,22 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
if (grp) { if (grp) {
*group = grp->bb_group; *group = grp->bb_group;
ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
} else { } else {
*new_cr = CR1_5; *new_cr = CR_BEST_AVAIL_LEN;
} }
} }
/* /*
* We couldn't find a group in CR1 so try to find the highest free fragment * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment
* order we have and proactively trim the goal request length to that order to * order we have and proactively trim the goal request length to that order to
* find a suitable group faster. * find a suitable group faster.
* *
* This optimizes allocation speed at the cost of slightly reduced * This optimizes allocation speed at the cost of slightly reduced
* preallocations. However, we make sure that we don't trim the request too * preallocations. However, we make sure that we don't trim the request too
* much and fall to CR2 in that case. * much and fall to CR_GOAL_LEN_SLOW in that case.
*/ */
static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{ {
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
@ -991,9 +995,9 @@ static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac,
int i, order, min_order; int i, order, min_order;
unsigned long num_stripe_clusters = 0; unsigned long num_stripe_clusters = 0;
if (unlikely(ac->ac_flags & EXT4_MB_CR1_5_OPTIMIZED)) { if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) {
if (sbi->s_mb_stats) if (sbi->s_mb_stats)
atomic_inc(&sbi->s_bal_cr1_5_bad_suggestions); atomic_inc(&sbi->s_bal_best_avail_bad_suggestions);
} }
/* /*
@ -1003,7 +1007,7 @@ static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac,
* goal length. * goal length.
*/ */
order = fls(ac->ac_g_ex.fe_len); order = fls(ac->ac_g_ex.fe_len);
min_order = order - sbi->s_mb_cr1_5_max_trim_order; min_order = order - sbi->s_mb_best_avail_max_trim_order;
if (min_order < 0) if (min_order < 0)
min_order = 0; min_order = 0;
@ -1051,11 +1055,11 @@ static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac,
if (grp) { if (grp) {
*group = grp->bb_group; *group = grp->bb_group;
ac->ac_flags |= EXT4_MB_CR1_5_OPTIMIZED; ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
} else { } else {
/* Reset goal length to original goal length before falling into CR2 */ /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
*new_cr = CR2; *new_cr = CR_GOAL_LEN_SLOW;
} }
} }
@ -1063,7 +1067,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
{ {
if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
return 0; return 0;
if (ac->ac_criteria >= CR2) if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
return 0; return 0;
if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
return 0; return 0;
@ -1117,12 +1121,12 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
return; return;
} }
if (*new_cr == CR0) { if (*new_cr == CR_POWER2_ALIGNED) {
ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups);
} else if (*new_cr == CR1) { } else if (*new_cr == CR_GOAL_LEN_FAST) {
ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups);
} else if (*new_cr == CR1_5) { } else if (*new_cr == CR_BEST_AVAIL_LEN) {
ext4_mb_choose_next_group_cr1_5(ac, new_cr, group, ngroups); ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups);
} else { } else {
/* /*
* TODO: For CR=2, we can arrange groups in an rb tree sorted by * TODO: For CR=2, we can arrange groups in an rb tree sorted by
@ -2444,11 +2448,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
break; break;
} }
if (ac->ac_criteria < CR2) { if (ac->ac_criteria < CR_FAST) {
/* /*
* In CR1 and CR1_5, we are sure that this group will * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are
* have a large enough continuous free extent, so skip * sure that this group will have a large enough
* over the smaller free extents * continuous free extent, so skip over the smaller free
* extents
*/ */
j = mb_find_next_bit(bitmap, j = mb_find_next_bit(bitmap,
EXT4_CLUSTERS_PER_GROUP(sb), i); EXT4_CLUSTERS_PER_GROUP(sb), i);
@ -2544,7 +2549,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
BUG_ON(cr < CR0 || cr >= EXT4_MB_NUM_CRS); BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS);
if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp))
return false; return false;
@ -2558,7 +2563,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
return false; return false;
switch (cr) { switch (cr) {
case CR0: case CR_POWER2_ALIGNED:
BUG_ON(ac->ac_2order == 0); BUG_ON(ac->ac_2order == 0);
/* Avoid using the first bg of a flexgroup for data files */ /* Avoid using the first bg of a flexgroup for data files */
@ -2577,16 +2582,16 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
return false; return false;
return true; return true;
case CR1: case CR_GOAL_LEN_FAST:
case CR1_5: case CR_BEST_AVAIL_LEN:
if ((free / fragments) >= ac->ac_g_ex.fe_len) if ((free / fragments) >= ac->ac_g_ex.fe_len)
return true; return true;
break; break;
case CR2: case CR_GOAL_LEN_SLOW:
if (free >= ac->ac_g_ex.fe_len) if (free >= ac->ac_g_ex.fe_len)
return true; return true;
break; break;
case CR3: case CR_ANY_FREE:
return true; return true;
default: default:
BUG(); BUG();
@ -2627,7 +2632,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
free = grp->bb_free; free = grp->bb_free;
if (free == 0) if (free == 0)
goto out; goto out;
if (cr <= CR2 && free < ac->ac_g_ex.fe_len) if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len)
goto out; goto out;
if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
goto out; goto out;
@ -2642,15 +2647,16 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
ext4_get_group_desc(sb, group, NULL); ext4_get_group_desc(sb, group, NULL);
int ret; int ret;
/* cr=CR0/CR1 is a very optimistic search to find large /*
* good chunks almost for free. If buddy data is not * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
* ready, then this optimization makes no sense. But * search to find large good chunks almost for free. If buddy
* we never skip the first block group in a flex_bg, * data is not ready, then this optimization makes no sense. But
* since this gets used for metadata block allocation, * we never skip the first block group in a flex_bg, since this
* and we want to make sure we locate metadata blocks * gets used for metadata block allocation, and we want to make
* in the first block group in the flex_bg if possible. * sure we locate metadata blocks in the first block group in
* the flex_bg if possible.
*/ */
if (cr < CR2 && if (cr < CR_FAST &&
(!sbi->s_log_groups_per_flex || (!sbi->s_log_groups_per_flex ||
((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
!(ext4_has_group_desc_csum(sb) && !(ext4_has_group_desc_csum(sb) &&
@ -2810,10 +2816,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
} }
/* Let's just scan groups to find more-less suitable blocks */ /* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? CR0 : CR1; cr = ac->ac_2order ? CR_POWER2_ALIGNED : CR_GOAL_LEN_FAST;
/* /*
* cr == CR0 try to get exact allocation, * cr == CR_POWER2_ALIGNED try to get exact allocation,
* cr == CR3 try to get anything * cr == CR_ANY_FREE try to get anything
*/ */
repeat: repeat:
for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
@ -2843,7 +2849,7 @@ repeat:
* spend a lot of time loading imperfect groups * spend a lot of time loading imperfect groups
*/ */
if ((prefetch_grp == group) && if ((prefetch_grp == group) &&
(cr > CR1_5 || (cr >= CR_FAST ||
prefetch_ios < sbi->s_mb_prefetch_limit)) { prefetch_ios < sbi->s_mb_prefetch_limit)) {
nr = sbi->s_mb_prefetch; nr = sbi->s_mb_prefetch;
if (ext4_has_feature_flex_bg(sb)) { if (ext4_has_feature_flex_bg(sb)) {
@ -2881,9 +2887,11 @@ repeat:
} }
ac->ac_groups_scanned++; ac->ac_groups_scanned++;
if (cr == CR0) if (cr == CR_POWER2_ALIGNED)
ext4_mb_simple_scan_group(ac, &e4b); ext4_mb_simple_scan_group(ac, &e4b);
else if ((cr == CR1 || cr == CR1_5) && sbi->s_stripe && else if ((cr == CR_GOAL_LEN_FAST ||
cr == CR_BEST_AVAIL_LEN) &&
sbi->s_stripe &&
!(ac->ac_g_ex.fe_len % !(ac->ac_g_ex.fe_len %
EXT4_B2C(sbi, sbi->s_stripe))) EXT4_B2C(sbi, sbi->s_stripe)))
ext4_mb_scan_aligned(ac, &e4b); ext4_mb_scan_aligned(ac, &e4b);
@ -2900,9 +2908,9 @@ repeat:
if (sbi->s_mb_stats && i == ngroups) if (sbi->s_mb_stats && i == ngroups)
atomic64_inc(&sbi->s_bal_cX_failed[cr]); atomic64_inc(&sbi->s_bal_cX_failed[cr]);
if (i == ngroups && ac->ac_criteria == CR1_5) if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN)
/* Reset goal length to original goal length before /* Reset goal length to original goal length before
* falling into CR2 */ * falling into CR_GOAL_LEN_SLOW */
ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
} }
@ -2929,7 +2937,7 @@ repeat:
ac->ac_b_ex.fe_len = 0; ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE; ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST; ac->ac_flags |= EXT4_MB_HINT_FIRST;
cr = CR3; cr = CR_ANY_FREE;
goto repeat; goto repeat;
} }
} }
@ -3045,66 +3053,94 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
seq_puts(seq, "mballoc:\n"); seq_puts(seq, "mballoc:\n");
if (!sbi->s_mb_stats) { if (!sbi->s_mb_stats) {
seq_puts(seq, "\tmb stats collection turned off.\n"); seq_puts(seq, "\tmb stats collection turned off.\n");
seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); seq_puts(
seq,
"\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
return 0; return 0;
} }
seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); seq_printf(seq, "\tgroups_scanned: %u\n",
atomic_read(&sbi->s_bal_groups_scanned));
seq_puts(seq, "\tcr0_stats:\n"); /* CR_POWER2_ALIGNED stats */
seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR0])); seq_puts(seq, "\tcr_p2_aligned_stats:\n");
seq_printf(seq, "\t\tgroups_considered: %llu\n", seq_printf(seq, "\t\thits: %llu\n",
atomic64_read(&sbi->s_bal_cX_groups_considered[CR0])); atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED]));
seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR0])); seq_printf(
seq, "\t\tgroups_considered: %llu\n",
atomic64_read(
&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]));
seq_printf(seq, "\t\textents_scanned: %u\n",
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED]));
seq_printf(seq, "\t\tuseless_loops: %llu\n", seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR0])); atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED]));
seq_printf(seq, "\t\tbad_suggestions: %u\n", seq_printf(seq, "\t\tbad_suggestions: %u\n",
atomic_read(&sbi->s_bal_cr0_bad_suggestions)); atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions));
seq_puts(seq, "\tcr1_stats:\n"); /* CR_GOAL_LEN_FAST stats */
seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1])); seq_puts(seq, "\tcr_goal_fast_stats:\n");
seq_printf(seq, "\t\thits: %llu\n",
atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tgroups_considered: %llu\n", seq_printf(seq, "\t\tgroups_considered: %llu\n",
atomic64_read(&sbi->s_bal_cX_groups_considered[CR1])); atomic64_read(
seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1])); &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\textents_scanned: %u\n",
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tuseless_loops: %llu\n", seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR1])); atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tbad_suggestions: %u\n", seq_printf(seq, "\t\tbad_suggestions: %u\n",
atomic_read(&sbi->s_bal_cr1_bad_suggestions)); atomic_read(&sbi->s_bal_goal_fast_bad_suggestions));
seq_puts(seq, "\tcr1.5_stats:\n"); /* CR_BEST_AVAIL_LEN stats */
seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1_5])); seq_puts(seq, "\tcr_best_avail_stats:\n");
seq_printf(seq, "\t\tgroups_considered: %llu\n", seq_printf(seq, "\t\thits: %llu\n",
atomic64_read(&sbi->s_bal_cX_groups_considered[CR1_5])); atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN]));
seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1_5])); seq_printf(
seq, "\t\tgroups_considered: %llu\n",
atomic64_read(
&sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN]));
seq_printf(seq, "\t\textents_scanned: %u\n",
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN]));
seq_printf(seq, "\t\tuseless_loops: %llu\n", seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR1_5])); atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN]));
seq_printf(seq, "\t\tbad_suggestions: %u\n", seq_printf(seq, "\t\tbad_suggestions: %u\n",
atomic_read(&sbi->s_bal_cr1_5_bad_suggestions)); atomic_read(&sbi->s_bal_best_avail_bad_suggestions));
seq_puts(seq, "\tcr2_stats:\n"); /* CR_GOAL_LEN_SLOW stats */
seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2])); seq_puts(seq, "\tcr_goal_slow_stats:\n");
seq_printf(seq, "\t\thits: %llu\n",
atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW]));
seq_printf(seq, "\t\tgroups_considered: %llu\n", seq_printf(seq, "\t\tgroups_considered: %llu\n",
atomic64_read(&sbi->s_bal_cX_groups_considered[CR2])); atomic64_read(
seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR2])); &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW]));
seq_printf(seq, "\t\textents_scanned: %u\n",
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW]));
seq_printf(seq, "\t\tuseless_loops: %llu\n", seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR2])); atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW]));
seq_puts(seq, "\tcr3_stats:\n"); /* CR_ANY_FREE stats */
seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR3])); seq_puts(seq, "\tcr_any_free_stats:\n");
seq_printf(seq, "\t\tgroups_considered: %llu\n", seq_printf(seq, "\t\thits: %llu\n",
atomic64_read(&sbi->s_bal_cX_groups_considered[CR3])); atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE]));
seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR3])); seq_printf(
seq, "\t\tgroups_considered: %llu\n",
atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE]));
seq_printf(seq, "\t\textents_scanned: %u\n",
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE]));
seq_printf(seq, "\t\tuseless_loops: %llu\n", seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR3])); atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE]));
seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
/* Aggregates */
seq_printf(seq, "\textents_scanned: %u\n",
atomic_read(&sbi->s_bal_ex_scanned));
seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\tlen_goal_hits: %u\n",
atomic_read(&sbi->s_bal_len_goals));
seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
seq_printf(seq, "\tbuddies_generated: %u/%u\n", seq_printf(seq, "\tbuddies_generated: %u/%u\n",
atomic_read(&sbi->s_mb_buddies_generated), atomic_read(&sbi->s_mb_buddies_generated),
ext4_get_groups_count(sb)); ext4_get_groups_count(sb));
@ -3112,8 +3148,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic64_read(&sbi->s_mb_generation_time)); atomic64_read(&sbi->s_mb_generation_time));
seq_printf(seq, "\tpreallocated: %u\n", seq_printf(seq, "\tpreallocated: %u\n",
atomic_read(&sbi->s_mb_preallocated)); atomic_read(&sbi->s_mb_preallocated));
seq_printf(seq, "\tdiscarded: %u\n", seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded));
atomic_read(&sbi->s_mb_discarded));
return 0; return 0;
} }
@ -3600,7 +3635,7 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
sbi->s_mb_cr1_5_max_trim_order = MB_DEFAULT_CR1_5_TRIM_ORDER; sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;
/* /*
* The default group preallocation is 512, which for 4k block * The default group preallocation is 512, which for 4k block

View File

@ -86,11 +86,11 @@
#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 #define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16
/* /*
* The maximum order upto which CR1.5 can trim a particular allocation request. * The maximum order upto which CR_BEST_AVAIL_LEN can trim a particular
* Example, if we have an order 7 request and max trim order of 3, CR1.5 can * allocation request. Example, if we have an order 7 request and max trim order
* trim this upto order 4. * of 3, we can trim this request upto order 4.
*/ */
#define MB_DEFAULT_CR1_5_TRIM_ORDER 3 #define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3
/* /*
* Number of valid buddy orders * Number of valid buddy orders

View File

@ -223,7 +223,7 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
EXT4_RW_ATTR_SBI_UI(mb_cr1_5_max_trim_order, s_mb_cr1_5_max_trim_order); EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
#ifdef CONFIG_EXT4_DEBUG #ifdef CONFIG_EXT4_DEBUG
EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
#endif #endif
@ -274,7 +274,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(warning_ratelimit_burst), ATTR_LIST(warning_ratelimit_burst),
ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_interval_ms),
ATTR_LIST(msg_ratelimit_burst), ATTR_LIST(msg_ratelimit_burst),
ATTR_LIST(mb_cr1_5_max_trim_order), ATTR_LIST(mb_best_avail_max_trim_order),
ATTR_LIST(errors_count), ATTR_LIST(errors_count),
ATTR_LIST(warning_count), ATTR_LIST(warning_count),
ATTR_LIST(msg_count), ATTR_LIST(msg_count),

View File

@ -120,19 +120,19 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
{ EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \
{ EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}) { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"})
TRACE_DEFINE_ENUM(CR0); TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED);
TRACE_DEFINE_ENUM(CR1); TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST);
TRACE_DEFINE_ENUM(CR1_5); TRACE_DEFINE_ENUM(CR_BEST_AVAIL_LEN);
TRACE_DEFINE_ENUM(CR2); TRACE_DEFINE_ENUM(CR_GOAL_LEN_SLOW);
TRACE_DEFINE_ENUM(CR3); TRACE_DEFINE_ENUM(CR_ANY_FREE);
#define show_criteria(cr) \ #define show_criteria(cr) \
__print_symbolic(cr, \ __print_symbolic(cr, \
{ CR0, "CR0" }, \ { CR_POWER2_ALIGNED, "CR_POWER2_ALIGNED" }, \
{ CR1, "CR1" }, \ { CR_GOAL_LEN_FAST, "CR_GOAL_LEN_FAST" }, \
{ CR1_5, "CR1.5" } \ { CR_BEST_AVAIL_LEN, "CR_BEST_AVAIL_LEN" }, \
{ CR2, "CR2" }, \ { CR_GOAL_LEN_SLOW, "CR_GOAL_LEN_SLOW" }, \
{ CR3, "CR3" }) { CR_ANY_FREE, "CR_ANY_FREE" })
TRACE_EVENT(ext4_other_inode_update_time, TRACE_EVENT(ext4_other_inode_update_time,
TP_PROTO(struct inode *inode, ino_t orig_ino), TP_PROTO(struct inode *inode, ino_t orig_ino),