From 744288c7216bf5876f271c32516dfb66dedb0448 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 6 Sep 2015 17:50:13 +0800 Subject: [PATCH 01/54] f2fs: trace in batches extent info update Rename trace_f2fs_update_extent_tree to trace_f2fs_update_extent_tree_range, then expand it so that it traces extent info updates in batches. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 2 ++ include/trace/events/f2fs.h | 15 ++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 997ac86f2a1d..1cd6c6c57f78 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -409,6 +409,8 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (!et) return false; + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); + write_lock(&et->lock); if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index a01946514b5a..6aa63d963af4 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1132,17 +1132,19 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, __entry->len) ); -TRACE_EVENT(f2fs_update_extent_tree, +TRACE_EVENT(f2fs_update_extent_tree_range, - TP_PROTO(struct inode *inode, unsigned int pgofs, block_t blkaddr), + TP_PROTO(struct inode *inode, unsigned int pgofs, block_t blkaddr, + unsigned int len), - TP_ARGS(inode, pgofs, blkaddr), + TP_ARGS(inode, pgofs, blkaddr, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) __field(u32, blk) + __field(unsigned int, len) ), TP_fast_assign( @@ -1150,12 +1152,15 @@ TRACE_EVENT(f2fs_update_extent_tree, __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->blk = blkaddr; + __entry->len = len; ), - TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, blkaddr = %u", + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + "blkaddr = %u, len = %u", show_dev_ino(__entry), __entry->pgofs, - __entry->blk) + __entry->blk, + __entry->len) ); TRACE_EVENT(f2fs_shrink_extent_tree, From 538e17e7e96e14f76bb82dd83290a5315da70c3b Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Sun, 6 Sep 2015 08:28:46 -0400 Subject: [PATCH 02/54] f2fs: fix incorrect return statement in the function f2fs_ioc_release_volatile_write The function f2fs_ioc_release_volatile_write ends its body by returning zero unconditionally, which is incorrect: the call to punch_hole made just before this return statement can fail. Return punch_hole's result directly instead, so that callers of f2fs_ioc_release_volatile_write are notified when the call to punch_hole fails.
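The pattern at stake is plain error propagation; here is a minimal userspace sketch of the before and after (hypothetical names, not f2fs code):

#include <stdio.h>

/* stand-in for punch_hole(): 0 on success, negative errno on failure */
static int do_punch(void)
{
	return -5;		/* pretend the helper failed with -EIO */
}

static int release_buggy(void)
{
	do_punch();		/* result dropped: caller always sees success */
	return 0;
}

static int release_fixed(void)
{
	return do_punch();	/* failure propagates to the caller */
}

int main(void)
{
	printf("buggy=%d fixed=%d\n", release_buggy(), release_fixed());
	return 0;
}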
Signed-off-by: Nicholas Krause Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8120f8685141..b2fab9e2c09b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1437,8 +1437,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp) if (!f2fs_is_first_block_written(inode)) return truncate_partial_data_page(inode, 0, true); - punch_hole(inode, 0, F2FS_BLKSIZE); - return 0; + return punch_hole(inode, 0, F2FS_BLKSIZE); } static int f2fs_ioc_abort_volatile_write(struct file *filp) From 25b93346a6b5542fd7de50555f8f0ddfc56d7443 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 12 Sep 2015 11:13:04 -0700 Subject: [PATCH 03/54] f2fs: cover number of dirty node pages under node_write lock This number is referenced by checkpoint under the node_write lock. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 27d1a74dd6f3..4d9bedfe101c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1323,23 +1323,24 @@ static int f2fs_write_node_page(struct page *page, nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + get_node_info(sbi, nid, &ni); /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); unlock_page(page); return 0; } - if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - down_read(&sbi->node_write); - } - set_page_writeback(page); fio.blk_addr = ni.blk_addr; write_node_page(nid, &fio); From c5cd29d21cebc2e1802a9eea4cd940ddf9825ce1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 12 Sep 2015 11:25:30 -0700 Subject: [PATCH 04/54] f2fs: no need to lock for update_inode_page all the time As the comment says, we don't need to call f2fs_lock_op in write_inode to prevent producing dirty node pages all the time. That happens only when there are not enough free sections, and we can avoid it by calling balance_fs beforehand. Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 35aae65b3e5d..97e20decacb4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -296,16 +296,12 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; /* - * We need to lock here to prevent from producing dirty node pages + * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - f2fs_lock_op(sbi); update_inode_page(inode); - f2fs_unlock_op(sbi); - - if (wbc) - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi); return 0; } From c998012b0bb93009d66349ea7e9747c968375e69 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 11 Sep 2015 14:39:02 +0800 Subject: [PATCH 05/54] f2fs: verify file type early in f2fs_fallocate This patch verifies the file type early in f2fs_fallocate for cleanup; it also adds the missing verification for expand_inode_data.
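The cleanup is a standard hoisting of a shared precondition into the single entry point; a hedged userspace sketch (S_ISREG is the real macro, the helpers are hypothetical stand-ins):

#include <stdio.h>
#include <sys/stat.h>
#include <errno.h>

static int do_punch(void)  { return 0; }	/* had its own check before */
static int do_expand(void) { return 0; }	/* was missing the check */

/* single entry point: verify the file type once, covering every mode */
static int fallocate_entry(mode_t mode, int expand)
{
	if (!S_ISREG(mode))
		return -EINVAL;
	return expand ? do_expand() : do_punch();
}

int main(void)
{
	printf("dir=%d reg=%d\n",
	       fallocate_entry(S_IFDIR, 1), fallocate_entry(S_IFREG, 1));
	return 0;
}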
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b2fab9e2c09b..a85bb14716e5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -765,9 +765,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t off_start, off_end; int ret = 0; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - if (f2fs_has_inline_data(inode)) { ret = f2fs_convert_inline_inode(inode); if (ret) @@ -908,9 +905,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) loff_t new_size; int ret; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - if (offset + len >= i_size_read(inode)) return -EINVAL; @@ -959,9 +953,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, loff_t off_start, off_end; int ret = 0; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; @@ -1068,9 +1059,6 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) loff_t new_size; int ret; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - new_size = i_size_read(inode) + len; if (new_size > inode->i_sb->s_maxbytes) return -EFBIG; @@ -1228,6 +1216,10 @@ static long f2fs_fallocate(struct file *file, int mode, struct inode *inode = file_inode(file); long ret = 0; + /* f2fs only support ->fallocate for regular file */ + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (f2fs_encrypted_inode(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; From 100136acfb4023ab7dc899192e95aca9aedfe98a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 11 Sep 2015 14:43:02 +0800 Subject: [PATCH 06/54] f2fs: fix incorrect searching position when shrinking extent cache When shrinking the extent cache, we have two steps in the flow: 1) shrink objects which are unreferenced by inodes; 2) shrink objects from the LRU list of the extent cache. In step 1, if we haven't shrunk enough objects, we will try step 2; but before this fix we didn't update the searching position, which may point to the last inode index in the global extent tree, resulting in failure to shrink objects when traversing all inodes' extent trees. This patch fixes the issue by resetting the searching position to the beginning of the global extent tree. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 1cd6c6c57f78..a8b9aa29908d 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -652,6 +652,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } spin_unlock(&sbi->extent_lock); + /* + * reset ino for searching victims from beginning of global extent tree.
+ */ + ino = F2FS_ROOT_INO(sbi); + while ((found = radix_tree_gang_lookup(root, (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { unsigned i; From 9edcdabf36422d15d01db73b0fa5487e418beff6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 11 Sep 2015 14:43:52 +0800 Subject: [PATCH 07/54] f2fs: fix overflow of size calculation We have potential overflow issue when calculating size of object, when we left shift index with PAGE_CACHE_SHIFT bits, if type of index has only 32-bits space in 32-bit architecture, left shifting will incur overflow, i.e: pgoff_t index = 0xFFFFFFFF; loff_t size = index << PAGE_CACHE_SHIFT; size: 0xFFFFF000 So we should cast index with 64-bits type to avoid this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 ++++++----- fs/f2fs/debug.c | 12 ++++++------ fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 18 ++++++++++-------- fs/f2fs/recovery.c | 2 +- 5 files changed, 24 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a82abe921b89..681373e1da92 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -447,9 +447,9 @@ struct page *get_new_data_page(struct inode *inode, lock_page(page); } got_it: - if (new_i_size && - i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { - i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); + if (new_i_size && i_size_read(inode) < + ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) { + i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)); /* Only the directory inode sets new_i_size */ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); } @@ -489,8 +489,9 @@ static int __allocate_data_block(struct dnode_of_data *dn) /* update i_size */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + dn->ofs_in_node; - if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT)) - i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); + if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) + i_size_write(dn->inode, + ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); /* direct IO doesn't use extent cache to maximize the performance */ f2fs_drop_largest_extent(dn->inode, fofs); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index d013d8479753..ebfcc4039057 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -198,9 +198,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; npages = META_MAPPING(sbi)->nrpages; - si->page_mem += npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; } static int stat_show(struct seq_file *s, void *v) @@ -333,13 +333,13 @@ static int stat_show(struct seq_file *s, void *v) /* memory footprint */ update_mem_info(si->sbi); - seq_printf(s, "\nMemory: %u KB\n", + seq_printf(s, "\nMemory: %llu KB\n", (si->base_mem + si->cache_mem + si->page_mem) >> 10); - seq_printf(s, " - static: %u KB\n", + seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %u KB\n", + seq_printf(s, " - cached: %llu KB\n", si->cache_mem >> 10); - seq_printf(s, " - paged : %u KB\n", + seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } mutex_unlock(&f2fs_stat_mutex); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f1a90ffd7cad..79c38adf3ef2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1844,7 +1844,7 @@ struct f2fs_stat_info { unsigned int segment_count[2]; unsigned int block_count[2]; unsigned int inplace_count; - unsigned 
base_mem, cache_mem, page_mem; + unsigned long long base_mem, cache_mem, page_mem; }; static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a85bb14716e5..27a789c0d3c2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -74,7 +74,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, goto mapped; /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { + if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) > + i_size_read(inode)) { unsigned offset; offset = i_size_read(inode) & ~PAGE_CACHE_MASK; zero_user_segment(page, offset, PAGE_CACHE_SIZE); @@ -343,7 +344,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) { + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); if (err && err != -ENOENT) { @@ -802,8 +803,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi); - blk_start = pg_start << PAGE_CACHE_SHIFT; - blk_end = pg_end << PAGE_CACHE_SHIFT; + blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -994,7 +995,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, return ret; new_size = max_t(loff_t, new_size, - pg_start << PAGE_CACHE_SHIFT); + (loff_t)pg_start << PAGE_CACHE_SHIFT); } for (index = pg_start; index < pg_end; index++) { @@ -1030,7 +1031,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_unlock_op(sbi); new_size = max_t(loff_t, new_size, - (index + 1) << PAGE_CACHE_SHIFT); + (loff_t)(index + 1) << PAGE_CACHE_SHIFT); } if (off_end) { @@ -1192,9 +1193,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (pg_start == pg_end) new_size = offset + len; else if (index == pg_start && off_start) - new_size = (index + 1) << PAGE_CACHE_SHIFT; + new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT; else if (index == pg_end) - new_size = (index << PAGE_CACHE_SHIFT) + off_end; + new_size = ((loff_t)index << PAGE_CACHE_SHIFT) + + off_end; else new_size += PAGE_CACHE_SIZE; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index faec2ca004b9..acc21f20637b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -570,7 +570,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), - MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); + (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); From 514053e4543b33bbd74d8900f368df663cb03325 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 15 Sep 2015 14:46:43 -0700 Subject: [PATCH 08/54] f2fs: declare f2fs_update_extent_tree_range as static This function should be static. 
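As a generic C illustration (not f2fs code) of what the static qualifier buys:

/* internal linkage: the symbol stays private to this .c file, cannot
 * clash with same-named symbols in other translation units, and gives
 * the compiler more freedom to inline the function. */
static int local_helper(void)
{
	return 0;
}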
Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index a8b9aa29908d..63068b75b1bd 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -394,7 +394,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, return en; } -unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static unsigned int f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); From 973163fc0ce07761eae671e895dc6747a3410fe7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Sep 2015 16:51:51 +0800 Subject: [PATCH 09/54] f2fs: reorganize f2fs_map_blocks In this patch, we try to reorganize f2fs_map_blocks to make block mapping flow more clear by using following structure: /* check status of mapping */ if (unmapped) { /* blkaddr == NULL_ADDR || blkaddr == NEW_ADDR */ if (create) { /* write path, handle dio write case here */ alloc_and_map; } else { /* * handle read cases from all call paths: * 1. generic read; * 2. dio read; * 3. fiemap; * 4. bmap */ } } /* map buffer_header */ Besides, this patch handles the missing case correctly for dio write: When we fail in __allocate_data_blocks, then in f2fs_map_blocks, we will not allocate blocks correctly for preallocated blocks, but returning with an unmapped buffer head, which will result in failure of dio write. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 84 ++++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 681373e1da92..883b6499841f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -596,40 +596,36 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, err = 0; goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR) { - if (flag == F2FS_GET_BLOCK_BMAP) { - err = -ENOENT; - goto put_out; - } else if (flag == F2FS_GET_BLOCK_READ || - flag == F2FS_GET_BLOCK_DIO) { - goto put_out; + + if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { + if (create) { + err = __allocate_data_block(&dn); + if (err) + goto put_out; + allocated = true; + map->m_flags = F2FS_MAP_NEW; + } else { + if (flag != F2FS_GET_BLOCK_FIEMAP || + dn.data_blkaddr != NEW_ADDR) { + if (flag == F2FS_GET_BLOCK_BMAP) + err = -ENOENT; + goto put_out; + } + + /* + * preallocated unwritten block should be mapped + * for fiemap. + */ + if (dn.data_blkaddr == NEW_ADDR) + map->m_flags = F2FS_MAP_UNWRITTEN; } - /* - * if it is in fiemap call path (flag = F2FS_GET_BLOCK_FIEMAP), - * mark it as mapped and unwritten block. 
- */ } - if (dn.data_blkaddr != NULL_ADDR) { - map->m_flags = F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - if (dn.data_blkaddr == NEW_ADDR) - map->m_flags |= F2FS_MAP_UNWRITTEN; - } else if (create) { - err = __allocate_data_block(&dn); - if (err) - goto put_out; - allocated = true; - map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - } else { - if (flag == F2FS_GET_BLOCK_BMAP) - err = -ENOENT; - goto put_out; - } + map->m_flags |= F2FS_MAP_MAPPED; + map->m_pblk = dn.data_blkaddr; + map->m_len = 1; end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); - map->m_len = 1; dn.ofs_in_node++; pgofs++; @@ -648,23 +644,31 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR && - flag != F2FS_GET_BLOCK_FIEMAP) - goto put_out; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); } if (maxblocks > map->m_len) { block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NULL_ADDR && create) { - err = __allocate_data_block(&dn); - if (err) - goto sync_out; - allocated = true; - map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { + if (create) { + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + map->m_flags |= F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; + } else { + /* + * we only merge preallocated unwritten blocks + * for fiemap. + */ + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; + } } + /* Give more consecutive addresses for the readahead */ if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || From 569cf1876a32e574ba8a7fb825cd91bafd003882 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Sep 2015 13:38:23 -0700 Subject: [PATCH 10/54] f2fs crypto: allocate buffer for decrypting filename We got dentry pages from high_mem, and its address space directly goes into the decryption path via f2fs_fname_disk_to_usr. But, sg_init_one assumes the address is not from high_mem, so we can get this panic since it doesn't call kmap_high but kunmap_high is triggered at the end. kernel BUG at ../../../../../../kernel/mm/highmem.c:290! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM ... 
(kunmap_high+0xb0/0xb8) from [] (__kunmap_atomic+0xa0/0xa4) (__kunmap_atomic+0xa0/0xa4) from [] (blkcipher_walk_done+0x128/0x1ec) (blkcipher_walk_done+0x128/0x1ec) from [] (crypto_cbc_decrypt+0xc0/0x170) (crypto_cbc_decrypt+0xc0/0x170) from [] (crypto_cts_decrypt+0xc0/0x114) (crypto_cts_decrypt+0xc0/0x114) from [] (async_decrypt+0x40/0x48) (async_decrypt+0x40/0x48) from [] (f2fs_fname_disk_to_usr+0x124/0x304) (f2fs_fname_disk_to_usr+0x124/0x304) from [] (f2fs_fill_dentries+0xac/0x188) (f2fs_fill_dentries+0xac/0x188) from [] (f2fs_readdir+0x1f0/0x300) (f2fs_readdir+0x1f0/0x300) from [] (vfs_readdir+0x90/0xb4) (vfs_readdir+0x90/0xb4) from [] (SyS_getdents64+0x64/0xcc) (SyS_getdents64+0x64/0xcc) from [] (ret_fast_syscall+0x0/0x30) Cc: Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 13 ++++++++++--- fs/f2fs/namei.c | 10 +++++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8f15fc134040..6726c4a5efa2 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -787,7 +787,6 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, else d_type = DT_UNKNOWN; - /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -795,12 +794,20 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; + de_name.name = kmalloc(de_name.len, GFP_NOFS); + if (!de_name.name) + return false; + + memcpy(de_name.name, d->filename[bit_pos], de_name.len); + ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, &de_name, fstr); - de_name = *fstr; - fstr->len = save_len; + kfree(de_name.name); if (ret < 0) return true; + + de_name = *fstr; + fstr->len = save_len; } if (!dir_emit(ctx, de_name.name, de_name.len, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a680bf38e4f0..dfa01c88b34b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -947,8 +947,13 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook /* Symlink is encrypted */ sd = (struct f2fs_encrypted_symlink_data *)caddr; - cstr.name = sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); + cstr.name = kmalloc(cstr.len, GFP_NOFS); + if (!cstr.name) { + res = -ENOMEM; + goto errout; + } + memcpy(cstr.name, sd->encrypted_path, cstr.len); /* this is broken symlink case */ if (cstr.name[0] == 0 && cstr.len == 0) { @@ -970,6 +975,8 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook if (res < 0) goto errout; + kfree(cstr.name); + paddr = pstr.name; /* Null-terminate the name */ @@ -979,6 +986,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook page_cache_release(cpage); return *cookie = paddr; errout: + kfree(cstr.name); f2fs_fname_crypto_free_buffer(&pstr); kunmap(cpage); page_cache_release(cpage); From a7230d16d5f8635dbc029af057969e46e3eb2246 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Sep 2015 14:06:54 -0700 Subject: [PATCH 11/54] f2fs: check end_io for metapages before making next checkpoint blocks This patch avoids to produce new checkpoint blocks before the previous meta pages were written completely. 
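The ordering invariant can be sketched in illustrative C; wait_on_all_pages_writeback and f2fs_cp_error are the real f2fs names, while the counter and helper names here are hypothetical:

#include <stdatomic.h>

static atomic_int meta_io_in_flight;	/* hypothetical in-flight counter */
static _Bool cp_failed;

static void wait_on_all_meta_writeback(void)
{
	/* the kernel sleeps on a waitqueue; a spin keeps the sketch short */
	while (atomic_load(&meta_io_in_flight) > 0)
		;
}

static int write_checkpoint_block0(void)
{
	/* wait for prior meta end_io results before reusing CP blocks */
	wait_on_all_meta_writeback();
	if (cp_failed)
		return -1;	/* abort: earlier meta IO reported an error */
	/* ...only now is it safe to write the checkpoint buffer at block 0... */
	return 0;
}

int main(void)
{
	return write_checkpoint_block0();
}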
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c5a38e352a80..ff53405aee39 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1000,6 +1000,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_addr(sbi); + /* need to wait for end_io results */ + wait_on_all_pages_writeback(sbi); + if (unlikely(f2fs_cp_error(sbi))) + return; + /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); From 41a099de3a8d2b570147445c5cddc59c0daf5e96 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Thu, 17 Sep 2015 18:24:17 +0800 Subject: [PATCH 12/54] f2fs: drop largest extent by range Now that we update extents by range, fofs may not be on the largest extent if the new extent overlaps with it, so add a new function to drop the largest extent properly. Signed-off-by: Fan li Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 63068b75b1bd..2f013e22b7bb 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -155,11 +155,12 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, return count - et->count; } -static void __drop_largest_extent(struct inode *inode, pgoff_t fofs) +static void __drop_largest_extent(struct inode *inode, + pgoff_t fofs, unsigned int len) { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - if (largest->fofs <= fofs && largest->fofs + largest->len > fofs) + if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) largest->len = 0; } @@ -168,7 +169,7 @@ void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) if (!f2fs_may_extent_tree(inode)) return; - __drop_largest_extent(inode, fofs); + __drop_largest_extent(inode, fofs, 1); } void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) From 4d1fa815f26ce92dc4710d28f50071174a2340b7 Mon Sep 17 00:00:00 2001 From: Fan Li Date: Thu, 17 Sep 2015 18:42:06 +0800 Subject: [PATCH 13/54] f2fs: optimize code of f2fs_update_extent_tree_range Fix 2 potential problems: 1. when the largest extent needs to be invalidated, it is reset in __drop_largest_extent, which makes the subsequent __is_extent_same always return false and leaves the largest extent unchanged. Now we update it properly. 2. when an extent is split and the latter part remains in the tree, next_en should be the latter part instead of the next extent of the original extent. This would cause a merge failure if there were in-place updates; although there are none, the fix still makes the code less ambiguous. This patch also simplifies the code for invalidating extents and optimizes the procedure that splits an extent into two. There are a few modifications since the last patch: 1. prev_en is now updated properly. 2. more code and branches are simplified.
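A standalone sketch of the splitting logic the patch implements, assuming the kernel's F2FS_MIN_EXTENT_LEN threshold; the struct and driver below are illustrative, not the kernel code:

#include <stdio.h>

#define F2FS_MIN_EXTENT_LEN 64	/* matches the kernel constant */

struct ext { unsigned fofs, blk, len; };

/* split one cached extent e by an update range [pos, end): up to two
 * surviving parts, mirroring the patch's "parts" counting. */
static int split(struct ext e, unsigned pos, unsigned end,
		 struct ext *left, struct ext *right)
{
	unsigned org_end = e.fofs + e.len;
	int parts = 0;

	/* left remainder |VVVVVXXX...| kept only if large enough */
	if (pos > e.fofs && pos - e.fofs >= F2FS_MIN_EXTENT_LEN) {
		*left = (struct ext){ e.fofs, e.blk, pos - e.fofs };
		parts++;
	}
	/* right remainder |...XXXVVVVV| kept only if large enough */
	if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) {
		*right = (struct ext){ end, e.blk + (end - e.fofs),
				       org_end - end };
		parts++;
	}
	return parts;	/* 0: node detached, 1: shrunk, 2: split in two */
}

int main(void)
{
	struct ext l = { 0 }, r = { 0 };
	int parts = split((struct ext){ 0, 1000, 256 }, 100, 120, &l, &r);

	printf("parts=%d left=[%u,+%u) right=[%u,+%u)\n",
	       parts, l.fofs, l.len, r.fofs, r.len);
	return 0;
}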
Signed-off-by: Fan li Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 183 +++++++++++++++-------------------------- 1 file changed, 68 insertions(+), 115 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2f013e22b7bb..c9d1cfd13b58 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -400,7 +400,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; - struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; + struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; @@ -422,148 +422,101 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, prev = et->largest; dei.len = 0; - /* we do not guarantee that the largest extent is cached all the time */ + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, &insert_p, &insert_parent); - if (!en) { - if (next_en) { - en = next_en; - f2fs_bug_on(sbi, en->ei.fofs <= pos); - pos = en->ei.fofs; - } else { - /* - * skip searching in the tree since there is no - * larger extent node in the cache. - */ - goto update_extent; - } - } + if (!en) + en = next_en; /* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */ - while (en) { - struct rb_node *node; + while (en && en->ei.fofs < end) { + unsigned int org_end; + int parts = 0; /* # of parts current extent split into */ - if (pos >= end) - break; + next_en = en1 = NULL; dei = en->ei; - en1 = en2 = NULL; + org_end = dei.fofs + dei.len; + f2fs_bug_on(sbi, pos >= org_end); - node = rb_next(&en->rb_node); - - /* - * 2.1 there are four cases when we invalidate blkaddr in extent - * node, |V: valid address, X: will be invalidated| - */ - /* case#1, invalidate right part of extent node |VVVVVXXXXX| */ - if (pos > dei.fofs && end >= dei.fofs + dei.len) { - en->ei.len = pos - dei.fofs; - - if (en->ei.len < F2FS_MIN_EXTENT_LEN) { - __detach_extent_node(sbi, et, en); - insert_p = NULL; - insert_parent = NULL; - goto update; - } - - if (__is_extent_same(&dei, &et->largest)) - et->largest = en->ei; - goto next; + if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + en->ei.len = pos - en->ei.fofs; + prev_en = en; + parts = 1; } - /* case#2, invalidate left part of extent node |XXXXXVVVVV| */ - if (pos <= dei.fofs && end < dei.fofs + dei.len) { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; - - if (en->ei.len < F2FS_MIN_EXTENT_LEN) { - __detach_extent_node(sbi, et, en); - insert_p = NULL; - insert_parent = NULL; - goto update; - } - - if (__is_extent_same(&dei, &et->largest)) - et->largest = en->ei; - goto next; - } - - __detach_extent_node(sbi, et, en); - - /* - * if we remove node in rb-tree, our parent node pointer may - * point the wrong place, discard them. 
- */ - insert_p = NULL; - insert_parent = NULL; - - /* case#3, invalidate entire extent node |XXXXXXXXXX| */ - if (pos <= dei.fofs && end >= dei.fofs + dei.len) { - if (__is_extent_same(&dei, &et->largest)) - et->largest.len = 0; - goto update; - } - - /* - * case#4, invalidate data in the middle of extent node - * |VVVXXXXVVV| - */ - if (dei.len > F2FS_MIN_EXTENT_LEN) { - unsigned int endofs; - - /* insert left part of split extent into cache */ - if (pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - set_extent_info(&ei, dei.fofs, dei.blk, - pos - dei.fofs); - en1 = __insert_extent_tree(sbi, et, &ei, - NULL, NULL); - } - - /* insert right part of split extent into cache */ - endofs = dei.fofs + dei.len; - if (endofs - end >= F2FS_MIN_EXTENT_LEN) { + if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (parts) { set_extent_info(&ei, end, end - dei.fofs + dei.blk, - endofs - end); - en2 = __insert_extent_tree(sbi, et, &ei, - NULL, NULL); + org_end - end); + en1 = __insert_extent_tree(sbi, et, &ei, + NULL, NULL); + next_en = en1; + } else { + en->ei.fofs = end; + en->ei.blk += end - dei.fofs; + en->ei.len -= end - dei.fofs; + next_en = en; } + parts++; } -update: - /* 2.2 update in global extent list */ + + if (!next_en) { + struct rb_node *node = rb_next(&en->rb_node); + + next_en = node ? + rb_entry(node, struct extent_node, rb_node) + : NULL; + } + + if (parts) { + if (en->ei.len > et->largest.len) + et->largest = en->ei; + } else { + __detach_extent_node(sbi, et, en); + } + + /* + * if original extent is split into zero or two parts, extent + * tree has been altered by deletion or insertion, therefore + * invalidate pointers regard to tree. + */ + if (parts != 1) { + insert_p = NULL; + insert_parent = NULL; + } + + /* update in global extent list */ spin_lock(&sbi->extent_lock); - if (en && !list_empty(&en->list)) + if (!parts && !list_empty(&en->list)) list_del(&en->list); if (en1) list_add_tail(&en1->list, &sbi->extent_list); - if (en2) - list_add_tail(&en2->list, &sbi->extent_list); spin_unlock(&sbi->extent_lock); - /* 2.3 release extent node */ - if (en) + /* release extent node */ + if (!parts) kmem_cache_free(extent_node_slab, en); -next: - en = node ? rb_entry(node, struct extent_node, rb_node) : NULL; - next_en = en; - if (en) - pos = en->ei.fofs; + + en = next_en; } -update_extent: /* 3. 
update extent in extent cache */ if (blkaddr) { struct extent_node *den = NULL; set_extent_info(&ei, fofs, blkaddr, len); en1 = __try_merge_extent_node(sbi, et, &ei, &den, prev_en, next_en); if (!en1) en1 = __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ @@ -575,11 +528,11 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, } spin_lock(&sbi->extent_lock); - if (en3) { - if (list_empty(&en3->list)) - list_add_tail(&en3->list, &sbi->extent_list); + if (en1) { + if (list_empty(&en1->list)) + list_add_tail(&en1->list, &sbi->extent_list); else - list_move_tail(&en3->list, &sbi->extent_list); + list_move_tail(&en1->list, &sbi->extent_list); } if (den && !list_empty(&den->list)) list_del(&den->list); From ea58711e884c3006e6f61e44d47f0c2d9f0979e1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Sep 2015 20:22:44 +0800 Subject: [PATCH 14/54] f2fs: do in batches truncation in truncate_hole truncate_data_blocks_range can truncate in batches, which applies all the changes to dnode page content, dnode page status, the extent cache, and the block count together. But previously, truncate_hole() always truncated one block in a dnode page at a time by invoking truncate_data_blocks_range(,1), which makes things slow. This patch changes truncate_hole() to truncate all target blocks in one direct node in a single batch inside truncate_data_blocks_range, which makes our punch hole operation in ->fallocate more efficient. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 27a789c0d3c2..6702157801a6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -739,23 +739,31 @@ static int fill_zero(struct inode *inode, pgoff_t index, int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) { - pgoff_t index; int err; - for (index = pg_start; index < pg_end; index++) { + while (pg_start < pg_end) { struct dnode_of_data dn; + pgoff_t end_offset, count; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { - if (err == -ENOENT) + if (err == -ENOENT) { + pg_start++; continue; + } return err; } - if (dn.data_blkaddr != NULL_ADDR) - truncate_data_blocks_range(&dn, 1); + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); + + f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); + + truncate_data_blocks_range(&dn, count); f2fs_put_dnode(&dn); + + pg_start += count; } return 0; } From f9811703fefbdb22abf723ec61368777cdad8508 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 21 Sep 2015 20:17:52 +0800 Subject: [PATCH 15/54] f2fs: fix to handle io error in ->direct_IO Here is an oops reported as the following message when testing generic/019 of xfstest: ------------[ cut here ]------------ kernel BUG at /home/yuchao/git/f2fs-dev/segment.c:882!
invalid opcode: 0000 [#1] SMP Modules linked in: zram lz4_compress lz4_decompress f2fs(O) ip6table_filter ip6_tables ebtable_nat ebtables nf_conntrack_ipv4 nf_def CPU: 2 PID: 25441 Comm: fio Tainted: G O 4.3.0-rc1+ #6 Hardware name: Hewlett-Packard HP Z220 CMT Workstation/1790, BIOS K51 v01.61 05/16/2013 task: ffff8803f4e85580 ti: ffff8803fd61c000 task.ti: ffff8803fd61c000 RIP: 0010:[] [] new_curseg+0x321/0x330 [f2fs] RSP: 0018:ffff8803fd61f918 EFLAGS: 00010246 RAX: 00000000000007ed RBX: 0000000000000224 RCX: 000000000000001f RDX: 0000000000000800 RSI: ffffffffffffffff RDI: ffff8803f56f4300 RBP: ffff8803fd61f978 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000024 R11: ffff8800d23bbd78 R12: ffff8800d0ef0000 R13: 0000000000000224 R14: 0000000000000000 R15: 0000000000000001 FS: 00007f827ff85700(0000) GS:ffff88041ea80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffff600000 CR3: 00000003fef17000 CR4: 00000000001406e0 Stack: 000007ea00000002 0000000100000001 ffff8803f6456248 000007ed0000002b 0000000000000224 ffff880404d1aa20 ffff8803fd61f9c8 ffff8800d0ef0000 ffff8803f6456248 0000000000000001 00000000ffffffff ffffffffa078f358 Call Trace: [] allocate_segment_by_default+0x1a7/0x1f0 [f2fs] [] allocate_data_block+0x17c/0x360 [f2fs] [] __allocate_data_block+0x131/0x1d0 [f2fs] [] f2fs_direct_IO+0x4b5/0x580 [f2fs] [] generic_file_direct_write+0xae/0x160 [] __generic_file_write_iter+0xd5/0x1f0 [] generic_file_write_iter+0xf7/0x200 [] ? apparmor_file_permission+0x18/0x20 [] ? f2fs_fallocate+0x1190/0x1190 [f2fs] [] f2fs_file_write_iter+0x46/0x90 [f2fs] [] aio_run_iocb+0x1ee/0x290 [] ? mutex_lock+0x1e/0x50 [] ? aio_read_events+0x207/0x2b0 [] do_io_submit+0x373/0x630 [] ? SyS_io_getevents+0x56/0xb0 [] SyS_io_submit+0x10/0x20 [] entry_SYSCALL_64_fastpath+0x12/0x6a Code: 45 c8 48 8b 78 10 e8 9f 23 bf e0 41 8b 8c 24 cc 03 00 00 89 c7 31 d2 89 c6 89 d8 29 df f7 f1 29 d1 39 cf 0f 83 be fd ff ff eb RIP [] new_curseg+0x321/0x330 [f2fs] RSP ---[ end trace 2e577d7f711ddb86 ]--- The reason is that: in the test of generic/019, we will trigger a manmade IO error in block layer through debugfs, after that, prefree segment will no longer be freed, because we always skip doing gc or checkpoint when there occurs an IO error. Meanwhile fio with aio engine generated a large number of direct IOs, which continue allocating spaces in free segment until we run out of them, eventually, results in panic in new_curseg as no more free segment was found. So, this patch changes to return EIO in direct_IO for this condition. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 883b6499841f..d265401b5326 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -524,6 +524,9 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, while (dn.ofs_in_node < end_offset && len) { block_t blkaddr; + if (unlikely(f2fs_cp_error(sbi))) + goto sync_out; + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { if (__allocate_data_block(&dn)) @@ -566,6 +569,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int mode = create ? 
ALLOC_NODE : LOOKUP_NODE_RA; pgoff_t pgofs, end_offset; int err = 0, ofs = 1; @@ -599,6 +603,10 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto put_out; + } err = __allocate_data_block(&dn); if (err) goto put_out; @@ -652,6 +660,10 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; + } err = __allocate_data_block(&dn); if (err) goto sync_out; @@ -1556,10 +1568,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (iov_iter_rw(iter) == WRITE) + if (iov_iter_rw(iter) == WRITE) { __allocate_data_blocks(inode, offset, count); + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + err = -EIO; + goto out; + } + } err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); +out: if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); From 46c9e1413f3c4ecbbe775e545063e88feb1f9871 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Sep 2015 16:54:16 +0800 Subject: [PATCH 16/54] f2fs: use correct flag in f2fs_map_blocks() We introduced F2FS_GET_BLOCK_READ in commit e2b4e2bc8865 ("f2fs: fix incorrect mapping for bmap"), but forgot to use this flag in the right place; fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d265401b5326..bc04e9201fd6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -920,7 +920,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; - if (f2fs_map_blocks(inode, &map, 0, false)) + if (f2fs_map_blocks(inode, &map, 0, + F2FS_GET_BLOCK_READ)) goto set_error_page; } got_it: From 9cd81ce3c2f01fc599b9156b94b868b4f578698c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Sep 2015 16:55:26 +0800 Subject: [PATCH 17/54] f2fs: disallow switch extent_cache option dynamically Switching the extent_cache option dynamically on remount may cause a consistency issue between the extent cache and dnode pages. Fix this in this patch by disallowing the switch.
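The guard amounts to comparing the option bit before and after parsing the new mount options; a hedged userspace sketch (the bitmask name is hypothetical; the kernel uses test_opt() on sbi):

#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

#define OPT_EXTENT_CACHE 0x1	/* hypothetical option bit */

/* reject a remount that flips extent_cache on or off */
static int check_remount_opts(unsigned old_opt, unsigned new_opt)
{
	bool was_on = old_opt & OPT_EXTENT_CACHE;
	bool now_on = new_opt & OPT_EXTENT_CACHE;

	if (was_on != now_on)
		return -EINVAL;	/* "switch extent_cache option is not allowed" */
	return 0;
}

int main(void)
{
	printf("keep=%d flip=%d\n",
	       check_remount_opts(OPT_EXTENT_CACHE, OPT_EXTENT_CACHE),
	       check_remount_opts(OPT_EXTENT_CACHE, 0));
	return 0;
}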
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f79478115d37..16442ec89489 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -742,6 +742,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) int err, active_logs; bool need_restart_gc = false; bool need_stop_gc = false; + bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); sync_filesystem(sb); @@ -767,6 +768,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; + /* disallow enable/disable extent_cache dynamically */ + if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + err = -EINVAL; + f2fs_msg(sbi->sb, KERN_WARNING, + "switch extent_cache option is not allowed"); + goto restore_opts; + } + /* * We stop the GC thread if FS is mounted as RO * or if background_gc = off is passed in mount From 545fe4210df5eb4097aa17c68f0f153db27bcf44 Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Mon, 21 Sep 2015 18:55:49 -0400 Subject: [PATCH 18/54] f2fs: fix error handling for calls to various functions in the function recover_inline_data This fixes the error handling for the calls to truncate_inline_inode and truncate_blocks in recover_inline_data: both can fail, signalling it either via an error code or via the boolean value false, and we cannot continue after either call fails, so check each result and return false immediately to signal the failure to recover_inline_data's caller. Signed-off-by: Nicholas Krause Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3d143be42895..3b18e23319c3 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -274,12 +274,14 @@ bool recover_inline_data(struct inode *inode, struct page *npage) if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - truncate_inline_inode(ipage, 0); + if (!truncate_inline_inode(ipage, 0)) + return false; f2fs_clear_inline_inode(inode); update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - truncate_blocks(inode, 0, false); + if (truncate_blocks(inode, 0, false)) + return false; goto process_inline; } return false; From 4abd3f5ac49314fe345566ec859b8e60993207bf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 22 Sep 2015 21:07:47 +0800 Subject: [PATCH 19/54] f2fs: introduce __try_update_largest_extent This patch adds a new helper __try_update_largest_extent for cleanup.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 14 +++++--------- fs/f2fs/f2fs.h | 7 +++++++ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c9d1cfd13b58..a38ee9bec4ba 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -351,8 +351,7 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, } if (en) { - if (en->ei.len > et->largest.len) - et->largest = en->ei; + __try_update_largest_extent(et, en); et->cached_en = en; } return en; } @@ -389,8 +388,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, if (!en) return NULL; - if (en->ei.len > et->largest.len) - et->largest = en->ei; + __try_update_largest_extent(et, en); et->cached_en = en; return en; } @@ -476,12 +474,10 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, : NULL; } - if (parts) { - if (en->ei.len > et->largest.len) - et->largest = en->ei; - } else { + if (parts) + __try_update_largest_extent(et, en); + else __detach_extent_node(sbi, et, en); - } /* * if original extent is split into zero or two parts, extent diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 79c38adf3ef2..8d6681a33258 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -492,6 +492,13 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } +static inline void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (en->ei.len > et->largest.len) + et->largest = en->ei; +} + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ From 1d7e10d58a1f62c8c44668cca1cff6a2dd57d7d9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 Sep 2015 09:25:43 +0800 Subject: [PATCH 20/54] f2fs: fix incorrect bimodal calculation In update_sit_info, we use div_u64 to handle the 'u64 divided by u64' case, but div_u64 can only handle a 32-bit divisor, so our u64 divisor passed to div_u64 will overflow, resulting in a wrong calculation when showing f2fs debug info, as below: BDF: 464, avg. vblocks: 23509 (BDF should never exceed 100) So change to div64_u64, which handles this case correctly. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ebfcc4039057..615a307a7871 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -118,7 +118,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) } } dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); - si->bimodal = div_u64(bimodal, dist); + si->bimodal = div64_u64(bimodal, dist); if (si->dirty_count) si->avg_vblocks = div_u64(total_vblocks, ndirty); else From 7223554133c3f72809ea6ddbf0d8464e7c70c1b1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 25 Sep 2015 17:54:56 +0800 Subject: [PATCH 21/54] f2fs: remove unneeded f2fs_{,un}lock_op in do_recover_data() Protecting the recovery flow with cp_rwsem is not needed, since we have already prevented any checkpoint from being triggered by holding cp_mutex.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index acc21f20637b..c5daec503e7f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -383,15 +383,11 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, start = start_bidx_of_node(ofs_of_node(page), fi); end = start + ADDRS_PER_PAGE(page, fi); - f2fs_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, start, ALLOC_NODE); - if (err) { - f2fs_unlock_op(sbi); + if (err) goto out; - } f2fs_wait_on_page_writeback(dn.node_page, NODE); @@ -456,7 +452,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, set_page_dirty(dn.node_page); err: f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); out: f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, recovered = %d blocks, err = %d", From 90b803e6fb6243922bff9ddd8a6205c17cb93b31 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Sep 2015 19:34:50 -0700 Subject: [PATCH 22/54] f2fs: do not skip dentry block writes Previously, we skipped dentry block writes when wbc was SYNC_NONE, there was no memory pressure, and the number of dirty pages was pretty small. But we never skipped normal data writes, and that did not have much impact on overall performance. Moreover, by skipping some data writes, kworker falls into an infinite loop trying to write blocks when many dir inodes have only one dentry block. So, this patch removes the skipping of data writes. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 ----- fs/f2fs/node.c | 5 ----- fs/f2fs/node.h | 1 - fs/f2fs/segment.h | 4 +--- 4 files changed, 1 insertion(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bc04e9201fd6..a903423e4cd5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1340,11 +1340,6 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; - if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && - get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && - available_free_memory(sbi, DIRTY_DENTS)) - goto skip_write; - /* during POR, we don't need to trigger writepage at all.
*/ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4d9bedfe101c..1fe49ca20757 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -52,11 +52,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); - } else if (type == DIRTY_DENTS) { - if (sbi->sb->s_bdi->wb.dirty_exceeded) - return false; - mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); - res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == INO_ENTRIES) { int i; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 7427e956ad81..51c62edf2e89 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -118,7 +118,6 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ - DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b6e4ed15c698..a294da70a7b5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -697,9 +697,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) if (sbi->sb->s_bdi->wb.dirty_exceeded) return 0; - if (type == DATA) - return sbi->blocks_per_seg; - else if (type == NODE) + if (type == NODE) return 3 * sbi->blocks_per_seg; else if (type == META) return MAX_BIO_BLOCKS(sbi); From 345a6b2ee2987a11bc8e9c08ff2b68a973fd912c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 Sep 2015 17:46:01 +0800 Subject: [PATCH 23/54] f2fs: fix to update {m,c}time correctly when truncating larger This patch fixes updating ctime and mtime correctly when truncating to a larger size in ->setattr. The bug was reported by xfstest generic/313 as below: generic/313 2s ... - output mismatch (see ./results/generic/313.out.bad) --- tests/generic/313.out 2015-08-04 15:28:53.430798882 +0800 +++ results/generic/313.out.bad 2015-09-28 17:04:27.294278016 +0800 @@ -1,2 +1,4 @@ QA output created by 313 Silence is golden +ctime not updated after truncate up +mtime not updated after truncate up ... (Run 'diff -u tests/generic/313.out tests/generic/313.out.bad' to see the entire diff) Ran: generic/313 Failures: generic/313 Failed 1 of 1 tests Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6702157801a6..a350c2aeccae 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -681,6 +681,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * larger than i_size. */ truncate_setsize(inode, attr->ia_size); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; } } From 45fe8492ccbe561c4b8918c2d4c83a0501e50646 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 Sep 2015 17:42:24 +0800 Subject: [PATCH 24/54] f2fs: fix to correct freed section number during gc This patch fixes maintaining the right count of sections freed during garbage collection when a foreground gc is triggered. Besides, when a foreground gc is running on the currently selected section, once we fail to gc one segment, it's better to abandon gcing the remaining segments in that section, because we will select the next victim for foreground gc anyway, so gc on the remaining segments of the previous section becomes overhead and also causes long latency for the caller.
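The accounting rule can be sketched as follows (illustrative C; do_garbage_collect is modeled by a hypothetical gc_one_segment that returns nonzero on success):

#include <stdio.h>

static int gc_one_segment(unsigned segno)
{
	return segno != 3;	/* pretend segment 3 cannot be cleaned */
}

static int gc_section(unsigned segno, unsigned segs_per_sec, int fg_gc)
{
	unsigned i;

	for (i = 0; i < segs_per_sec; i++) {
		/* FG_GC: abandon the rest of the section on first failure */
		if (!gc_one_segment(segno + i) && fg_gc)
			break;
	}
	/* count a freed section only if every segment in it was cleaned */
	return (i == segs_per_sec && fg_gc) ? 1 : 0;
}

int main(void)
{
	/* first section hits the bad segment, second is fully cleaned */
	printf("sec0=%d sec1=%d\n", gc_section(0, 4, 1), gc_section(4, 4, 1));
	return 0;
}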
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 782b8e72c094..b6e03ebc703c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -802,7 +802,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi) unsigned int segno = NULL_SEGNO; unsigned int i; int gc_type = BG_GC; - int nfree = 0; + int sec_freed = 0; int ret = -1; struct cp_control cpc; struct gc_inode_list gc_list = { @@ -817,7 +817,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi) if (unlikely(f2fs_cp_error(sbi))) goto stop; - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { gc_type = FG_GC; if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) write_checkpoint(sbi, &cpc); @@ -832,13 +832,23 @@ int f2fs_gc(struct f2fs_sb_info *sbi) ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, META_SSA); - for (i = 0; i < sbi->segs_per_sec; i++) - nfree += do_garbage_collect(sbi, segno + i, &gc_list, gc_type); + for (i = 0; i < sbi->segs_per_sec; i++) { + /* + * for FG_GC case, halt gcing left segments once failed one + * of segments in selected section to avoid long latency. + */ + if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) && + gc_type == FG_GC) + break; + } + + if (i == sbi->segs_per_sec && gc_type == FG_GC) + sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (has_not_enough_free_secs(sbi, nfree)) + if (has_not_enough_free_secs(sbi, sec_freed)) goto gc_more; if (gc_type == FG_GC) From ab126cfc3090fa5af2a87cc0698f793aebbec7d0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 18 Sep 2015 17:33:00 -0700 Subject: [PATCH 25/54] f2fs: should get a victim from retrials If we do not call get_victim first, we cannot get a new victim for retrial path. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b6e03ebc703c..343b096cb654 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -799,8 +799,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, int f2fs_gc(struct f2fs_sb_info *sbi) { - unsigned int segno = NULL_SEGNO; - unsigned int i; + unsigned int segno, i; int gc_type = BG_GC; int sec_freed = 0; int ret = -1; @@ -812,6 +811,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi) cpc.reason = __get_cp_reason(sbi); gc_more: + segno = NULL_SEGNO; + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; if (unlikely(f2fs_cp_error(sbi))) From 39307a8e2459ecdee9f1bc0b8a5d7af4a6d8f754 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Sep 2015 13:50:47 -0700 Subject: [PATCH 26/54] f2fs: use vmalloc to handle -ENOMEM error This patch introduces f2fs_kvmalloc to avoid -ENOMEM during mount. 
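A userspace analogue of the fallback pattern, with hypothetical stand-ins for the two kernel allocators:

#include <stdio.h>
#include <stdlib.h>

/* stand-in for kmalloc: pretend large contiguous requests fail */
static void *fast_alloc(size_t size)
{
	return size <= (1 << 20) ? malloc(size) : NULL;
}

/* try the cheap allocator first, fall back when it fails */
static void *kv_alloc(size_t size)
{
	void *ret = fast_alloc(size);

	if (!ret)
		ret = calloc(1, size);	/* the vmalloc analogue */
	return ret;
}

int main(void)
{
	void *small = kv_alloc(4096);
	void *large = kv_alloc((size_t)32 << 20);

	printf("small=%s large=%s\n",
	       small ? "ok" : "fail", large ? "ok" : "fail");
	free(small);
	free(large);
	return 0;
}

Since either path may have satisfied an allocation, a single free routine must cover both, which is why the hunks below convert the matching kfree/vfree calls to kvfree.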
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 21 +++++++++++++++++++++ fs/f2fs/segment.c | 32 ++++++++++++++++---------------- 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8d6681a33258..6f27b921b027 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #ifdef CONFIG_F2FS_CHECK_FS @@ -1586,6 +1587,26 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) return S_ISREG(mode); } +static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kzalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); + return ret; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 78e6d0696847..0ceff2891625 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "f2fs.h" @@ -1955,12 +1954,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry)); + sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -1982,8 +1982,8 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) * - sizeof(struct sec_entry)); + sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } @@ -2028,12 +2028,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2174,7 +2174,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2196,7 +2196,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -2301,7 +2301,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 
mutex_lock(&dirty_i->seglist_lock); - kfree(dirty_i->dirty_segmap[dirty_type]); + kvfree(dirty_i->dirty_segmap[dirty_type]); dirty_i->nr_dirty[dirty_type] = 0; mutex_unlock(&dirty_i->seglist_lock); } @@ -2309,7 +2309,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - kfree(dirty_i->victim_secmap); + kvfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) @@ -2348,8 +2348,8 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi) if (!free_i) return; SM_I(sbi)->free_info = NULL; - kfree(free_i->free_segmap); - kfree(free_i->free_secmap); + kvfree(free_i->free_segmap); + kvfree(free_i->free_secmap); kfree(free_i); } @@ -2370,9 +2370,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) } kfree(sit_i->tmp_map); - vfree(sit_i->sentries); - vfree(sit_i->sec_entries); - kfree(sit_i->dirty_sentries_bitmap); + kvfree(sit_i->sentries); + kvfree(sit_i->sec_entries); + kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); From 5b7ee374144f8ef2db3e25d0d59a8ad83bb3cf33 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 Sep 2015 17:38:48 +0800 Subject: [PATCH 27/54] f2fs: use atomic64_t for extent cache hit stat The extent cache hit stats keep increasing until remount, and since we use the atomic_t type for the stat variables, they can easily overflow when the extent cache is queried frequently on a long-running fs. To avoid that, this patch switches the hit stat variables to atomic64_t. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 22 +++++++++++----------- fs/f2fs/f2fs.h | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 615a307a7871..478e5d54154f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -33,11 +33,11 @@ static void update_general_status(struct f2fs_sb_info *sbi) int i; /* validation check of the segment numbers */ - si->hit_largest = atomic_read(&sbi->read_hit_largest); - si->hit_cached = atomic_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic_read(&sbi->read_hit_rbtree); + si->hit_largest = atomic64_read(&sbi->read_hit_largest); + si->hit_cached = atomic64_read(&sbi->read_hit_cached); + si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic_read(&sbi->total_hit_ext); + si->total_ext = atomic64_read(&sbi->total_hit_ext); si->ext_tree = sbi->total_ext_tree; si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); @@ -283,12 +283,12 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); seq_puts(s, "\nExtent Cache:\n"); - seq_printf(s, " - Hit Count: L1-1:%d L1-2:%d L2:%d\n", + seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", si->hit_largest, si->hit_cached, si->hit_rbtree); - seq_printf(s, " - Hit Ratio: %d%% (%d / %d)\n", + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", !si->total_ext ?
0 : - (si->hit_total * 100) / si->total_ext, + div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", si->ext_tree, si->ext_node); @@ -378,10 +378,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic_set(&sbi->total_hit_ext, 0); - atomic_set(&sbi->read_hit_rbtree, 0); - atomic_set(&sbi->read_hit_largest, 0); - atomic_set(&sbi->read_hit_cached, 0); + atomic64_set(&sbi->total_hit_ext, 0); + atomic64_set(&sbi->read_hit_rbtree, 0); + atomic64_set(&sbi->read_hit_largest, 0); + atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6f27b921b027..1fa0555d1c2c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -795,10 +795,10 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - atomic_t total_hit_ext; /* # of lookup extent cache */ - atomic_t read_hit_rbtree; /* # of hit rbtree extent node */ - atomic_t read_hit_largest; /* # of hit largest extent node */ - atomic_t read_hit_cached; /* # of hit cached extent node */ + atomic64_t total_hit_ext; /* # of lookup extent cache */ + atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ + atomic64_t read_hit_largest; /* # of hit largest extent node */ + atomic64_t read_hit_cached; /* # of hit cached extent node */ atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ @@ -1848,7 +1848,8 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - int hit_largest, hit_cached, hit_rbtree, hit_total, total_ext; + unsigned long long hit_largest, hit_cached, hit_rbtree; + unsigned long long hit_total, total_ext; int ext_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, dirty_nats, sits, dirty_sits, fnids; @@ -1885,10 +1886,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) #define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) -#define stat_inc_total_hit(sbi) (atomic_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_rbtree)) -#define stat_inc_largest_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_cached)) +#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) +#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) +#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ From a43f7ec327b0d86cbb80d0841673038c0706e714 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Oct 2015 22:19:24 +0800 Subject: [PATCH 28/54] f2fs: fix to avoid redundant searching in dirty map during gc When doing gc, we search a victim in dirty map, starting from position of last victim, we will reset the current searching position until we touch the end of dirty map, and then search the whole 
dirty map. As a result, we sometimes search the range [victim, last] twice, which is redundant; this patch avoids that. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 343b096cb654..e5c255ba227b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -257,6 +257,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; unsigned int secno, max_cost; + unsigned int last_segment = MAIN_SEGS(sbi); int nsearched = 0; mutex_lock(&dirty_i->seglist_lock); @@ -277,9 +278,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned long cost; unsigned int segno; - segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset); - if (segno >= MAIN_SEGS(sbi)) { + segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); + if (segno >= last_segment) { if (sbi->last_victim[p.gc_mode]) { + last_segment = sbi->last_victim[p.gc_mode]; sbi->last_victim[p.gc_mode] = 0; p.offset = 0; continue; From 3342bb303bf48dd8bb5ac94c3356489ff53e4d04 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Oct 2015 22:20:40 +0800 Subject: [PATCH 29/54] f2fs: skip searching dirty map if no dirty segment exists When searching for a victim during gc, if there are no dirty segments in the filesystem, we still take the time to scan the whole dirty segment map. That is unnecessary, so it is better to skip the search in this case. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e5c255ba227b..d844a8028527 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -268,6 +268,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = max_cost = get_max_cost(sbi, &p); + if (p.max_search == 0) + goto out; + if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -329,6 +332,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, sbi->cur_victim_sec, prefree_segments(sbi), free_segments(sbi)); } +out: mutex_unlock(&dirty_i->seglist_lock); return (p.min_segno == NULL_SEGNO) ? 0 : 1; From d530d4d8e237f4d12c93bb76df40b69b8b8a1dcd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Oct 2015 22:22:44 +0800 Subject: [PATCH 30/54] f2fs: support synchronous gc in ioctl This patch drops the in-batches gc triggered through ioctl, since the user can easily control gc by building a loop around the ->ioctl. We support synchronous gc by forcing FG_GC in f2fs_gc, so with it the user can be sure that all blocks gced in this round are persistent on the device by the time the ioctl returns.
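To make the new semantics concrete, a userspace caller could drive synchronous gc as sketched below; this is illustrative only — the magic value mirrors fs/f2fs/f2fs.h, the file path is a hypothetical mount point, and the caller needs CAP_SYS_ADMIN:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/types.h>

    #define F2FS_IOCTL_MAGIC                0xf5    /* as in fs/f2fs/f2fs.h */
    #define F2FS_IOC_GARBAGE_COLLECT        _IO(F2FS_IOCTL_MAGIC, 6)

    int main(void)
    {
            __u32 sync = 1; /* 1: force FG_GC; 0: one best-effort BG_GC pass */
            int fd = open("/mnt/f2fs/anyfile", O_RDONLY);   /* hypothetical path */

            if (fd < 0)
                    return 1;
            /* with sync == 1 the ioctl returns 0 while sections are still being
             * freed and -EAGAIN once nothing more could be reclaimed */
            while (ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, &sync) == 0)
                    ;
            close(fd);
            return 0;
    }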
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 22 +++++++++------------- fs/f2fs/gc.c | 21 +++++++++++++-------- fs/f2fs/gc.h | 6 ------ fs/f2fs/segment.c | 2 +- 5 files changed, 24 insertions(+), 29 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1fa0555d1c2c..1b2fc2389a56 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1830,7 +1830,7 @@ int f2fs_release_page(struct page *, gfp_t); int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); -int f2fs_gc(struct f2fs_sb_info *); +int f2fs_gc(struct f2fs_sb_info *, bool); void build_gc_manager(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a350c2aeccae..c26996646bc7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1618,29 +1618,25 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - __u32 i, count; + __u32 sync; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (get_user(count, (__u32 __user *)arg)) + if (get_user(sync, (__u32 __user *)arg)) return -EFAULT; - if (!count || count > F2FS_BATCH_GC_MAX_NUM) - return -EINVAL; + if (f2fs_readonly(sbi->sb)) + return -EROFS; - for (i = 0; i < count; i++) { + if (!sync) { if (!mutex_trylock(&sbi->gc_mutex)) - break; - - if (f2fs_gc(sbi)) - break; + return -EBUSY; + } else { + mutex_lock(&sbi->gc_mutex); } - if (put_user(i, (__u32 __user *)arg)) - return -EFAULT; - - return 0; + return f2fs_gc(sbi, sync); } long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d844a8028527..830d27770a32 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,7 +78,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi)) + if (f2fs_gc(sbi, false)) wait_ms = gc_th->no_gc_sleep_time; /* balancing f2fs's metadata periodically */ @@ -803,12 +803,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, return nfree; } -int f2fs_gc(struct f2fs_sb_info *sbi) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) { unsigned int segno, i; - int gc_type = BG_GC; + int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; - int ret = -1; + int ret = -EINVAL; struct cp_control cpc; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), @@ -855,15 +855,20 @@ int f2fs_gc(struct f2fs_sb_info *sbi) if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (has_not_enough_free_secs(sbi, sec_freed)) - goto gc_more; + if (!sync) { + if (has_not_enough_free_secs(sbi, sec_freed)) + goto gc_more; - if (gc_type == FG_GC) - write_checkpoint(sbi, &cpc); + if (gc_type == FG_GC) + write_checkpoint(sbi, &cpc); + } stop: mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); + + if (sync) + ret = sec_freed ? 0 : -EAGAIN; return ret; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index c5a055b3376e..b4a65be9f7d3 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -19,12 +19,6 @@ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ -/* - * with this macro, we can control the max time we do garbage collection, - * when user triggers batch mode gc by ioctl. - */ -#define F2FS_BATCH_GC_MAX_NUM 16 - /* Search max. 
number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0ceff2891625..6b8edf21a152 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -295,7 +295,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) */ if (has_not_enough_free_secs(sbi, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi); + f2fs_gc(sbi, false); } } From 456b88e4d15de833165cce67c2cdf998139e2fc1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 5 Oct 2015 22:24:19 +0800 Subject: [PATCH 31/54] f2fs: introduce a new ioctl F2FS_IOC_WRITE_CHECKPOINT This patch introduces a new ioctl for users who want to trigger a checkpoint from userspace. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 23 +++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1b2fc2389a56..18e590248bfc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -231,6 +231,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) #define F2FS_IOC_SET_ENCRYPTION_POLICY \ _IOR('f', 19, struct f2fs_encryption_policy) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c26996646bc7..b3985a680521 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1639,6 +1639,27 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return f2fs_gc(sbi, sync); } +static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct cp_control cpc; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + cpc.reason = __get_cp_reason(sbi); + + mutex_lock(&sbi->gc_mutex); + write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + + return 0; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1670,6 +1691,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_WRITE_CHECKPOINT: + return f2fs_ioc_write_checkpoint(filp, arg); default: return -ENOTTY; } From 6aefd93b01379bf0b62f8b38dcf7a21397893833 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Oct 2015 11:02:54 -0700 Subject: [PATCH 32/54] f2fs: introduce background_gc=sync mount option This patch introduces background_gc=sync, enabling synchronous cleaning in the background. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 3 ++- fs/f2fs/f2fs.h | 1 + fs/f2fs/gc.c | 2 +- fs/f2fs/segment.h | 4 +++- fs/f2fs/super.c | 21 +++++++++++++------ 5 files changed, 22 insertions(+), 9 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index e2d5105b7214..b102b436563e 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -102,7 +102,8 @@ background_gc=%s Turn on/off cleaning operations, namely garbage collection, triggered in background when I/O subsystem is idle.
If background_gc=on, it will turn on the garbage collection and if background_gc=off, garbage collection - will be truned off. + will be truned off. If background_gc=sync, it will turn + on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default. disable_roll_forward Disable the roll-forward recovery routine diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 18e590248bfc..00bd47045c1e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -53,6 +53,7 @@ #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_FORCE_FG_GC 0x00004000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 830d27770a32..e627c19f7301 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,7 +78,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, false)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) wait_ms = gc_th->no_gc_sleep_time; /* balancing f2fs's metadata periodically */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a294da70a7b5..e9afb5884312 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -137,10 +137,12 @@ enum { /* * BG_GC means the background cleaning job. * FG_GC means the on-demand cleaning job. + * FORCE_FG_GC means on-demand cleaning job in background. */ enum { BG_GC = 0, - FG_GC + FG_GC, + FORCE_FG_GC, }; /* for a function parameter to select a victim segment */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 16442ec89489..ba058d08cb33 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -292,11 +292,16 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; - if (strlen(name) == 2 && !strncmp(name, "on", 2)) + if (strlen(name) == 2 && !strncmp(name, "on", 2)) { set_opt(sbi, BG_GC); - else if (strlen(name) == 3 && !strncmp(name, "off", 3)) + clear_opt(sbi, FORCE_FG_GC); + } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { clear_opt(sbi, BG_GC); - else { + clear_opt(sbi, FORCE_FG_GC); + } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) { + set_opt(sbi, BG_GC); + set_opt(sbi, FORCE_FG_GC); + } else { kfree(name); return -EINVAL; } @@ -631,10 +636,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) - seq_printf(seq, ",background_gc=%s", "on"); - else + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) { + if (test_opt(sbi, FORCE_FG_GC)) + seq_printf(seq, ",background_gc=%s", "sync"); + else + seq_printf(seq, ",background_gc=%s", "on"); + } else { seq_printf(seq, ",background_gc=%s", "off"); + } if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, DISCARD)) From 5c2674347466d5c2d5169214e95f4ad6dc09e9b6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Oct 2015 11:32:34 -0700 Subject: [PATCH 33/54] f2fs: add a tracepoint for background gc This patch introduces a tracepoint to monitor background gc behaviors. 
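Once applied, the event can be enabled like any other f2fs tracepoint, e.g. through /sys/kernel/debug/tracing/events/f2fs/f2fs_background_gc/enable (the exact tracefs mount point may vary); each record then reports the thread's current sleep interval together with the prefree/free segment counts defined by TP_printk below.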
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 +++ include/trace/events/f2fs.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e627c19f7301..e7cec86f0747 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -77,6 +77,9 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); + trace_f2fs_background_gc(sbi->sb, wait_ms, + prefree_segments(sbi), free_segments(sbi)); + /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) wait_ms = gc_th->no_gc_sleep_time; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 6aa63d963af4..7de751d5763b 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -514,6 +514,34 @@ TRACE_EVENT(f2fs_map_blocks, __entry->ret) ); +TRACE_EVENT(f2fs_background_gc, + + TP_PROTO(struct super_block *sb, long wait_ms, + unsigned int prefree, unsigned int free), + + TP_ARGS(sb, wait_ms, prefree, free), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(long, wait_ms) + __field(unsigned int, prefree) + __field(unsigned int, free) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->wait_ms = wait_ms; + __entry->prefree = prefree; + __entry->free = free; + ), + + TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", + show_dev(__entry), + __entry->wait_ms, + __entry->prefree, + __entry->free) +); + TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, From 60b99b486b568c13cbb7caa83cf8a12af7665f1e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Oct 2015 14:49:57 -0700 Subject: [PATCH 34/54] f2fs: introduce a periodic checkpoint flow This patch introduces a periodic checkpoint feature. Note that it does not enforce checkpoints on a strict schedule; the trigger timing is best-effort and is only meant to improve the user experience. The default interval is 60 seconds. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/checkpoint.c | 3 +++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 4 +++- fs/f2fs/super.c | 5 +++++ 5 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 2c4cc42006e8..e066281dfd4e 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -80,3 +80,9 @@ Date: February 2015 Contact: "Jaegeuk Kim" Description: Controls the trimming rate in batch mode. + +What: /sys/fs/f2fs//cp_interval +Date: October 2015 +Contact: "Jaegeuk Kim" +Description: + Controls the checkpoint timing.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ff53405aee39..0569097dbd7a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1114,6 +1114,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason == CP_RECOVERY) f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); + + /* do checkpoint periodically */ + sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval); out: mutex_unlock(&sbi->cp_mutex); trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 00bd47045c1e..aad4720c516e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -124,6 +124,7 @@ enum { (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define DEF_CP_INTERVAL 60 /* 60 secs */ struct cp_control { int reason; @@ -734,6 +735,7 @@ struct f2fs_sb_info { struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; + long cp_expires, cp_interval; /* next expected periodic cp */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6b8edf21a152..1d86a35ae9fe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -315,7 +316,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || - !available_free_memory(sbi, INO_ENTRIES)) + !available_free_memory(sbi, INO_ENTRIES) || + jiffies > sbi->cp_expires) f2fs_sync_fs(sbi->sb, true); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ba058d08cb33..cb23d85a4ed3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -215,6 +215,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -231,6 +232,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), + ATTR_LIST(cp_interval), NULL, }; @@ -1014,6 +1016,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); sbi->dir_level = DEF_DIR_LEVEL; + sbi->cp_interval = DEF_CP_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); @@ -1350,6 +1353,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_commit_super(sbi, true); } + sbi->cp_expires = round_jiffies_up(jiffies); + return 0; free_kobj: From 6066d8cdb6dd0fd51ccd96027f6ae8e6b3b5adff Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Oct 2015 16:42:55 -0700 Subject: [PATCH 35/54] f2fs: merge as many meta writes as possible This patch tries to merge as many IOs as possible when the background flusher writes out dirty meta pages. [Before] ...
f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124320, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124560, size = 32768 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 95720, size = 987136 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123928, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123944, size = 8192 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123968, size = 45056 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124064, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 97648, size = 1007616 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123776, size = 8192 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123800, size = 32768 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 124624, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 99616, size = 921600 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123608, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123624, size = 77824 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123792, size = 4096 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 123864, size = 32768 ... [After] ... f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 92168, size = 892928 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 93912, size = 753664 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 95384, size = 716800 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 96784, size = 712704 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 104160, size = 364544 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 104872, size = 356352 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 105568, size = 278528 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 106112, size = 319488 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 106736, size = 258048 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 107240, size = 270336 f2fs_submit_write_bio: dev = (8,18), WRITE_SYNC(MP), META, sector = 107768, size = 180224 ...
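The traces above make the effect visible: before the change, the flusher mostly issues small 4KB-76KB bios with only the occasional large one, whereas afterwards nearly every submission lands in the 176KB-872KB range, i.e. contiguous dirty meta pages now leave in merged bios.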
Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0569097dbd7a..0a9ec4228342 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -257,7 +257,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = LONG_MAX; + pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX; struct pagevec pvec; long nwritten = 0; struct writeback_control wbc = { @@ -277,6 +277,13 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (prev == LONG_MAX) + prev = page->index - 1; + if (nr_to_write != LONG_MAX && page->index != prev + 1) { + pagevec_release(&pvec); + goto stop; + } + lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -297,13 +304,14 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, break; } nwritten++; + prev = page->index; if (unlikely(nwritten >= nr_to_write)) break; } pagevec_release(&pvec); cond_resched(); } - +stop: if (nwritten) f2fs_submit_merged_bio(sbi, type, WRITE); From c912a8298c16ef15aa2b7203022c935f439f488b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Oct 2015 09:46:37 -0700 Subject: [PATCH 36/54] f2fs: add F2FS_GOING_DOWN_METAFLUSH to test power-failure This patch introduces F2FS_GOING_DOWN_METAFLUSH which flushes meta pages like SSA blocks and then blocks all the writes. This can be used by power-failure tests. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index aad4720c516e..f05ae22558e2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -250,6 +250,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ #define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ #define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ +#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b3985a680521..6d3cfd55c3e4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1498,6 +1498,10 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) case F2FS_GOING_DOWN_NOSYNC: f2fs_stop_checkpoint(sbi); break; + case F2FS_GOING_DOWN_METAFLUSH: + sync_meta_pages(sbi, META, LONG_MAX); + f2fs_stop_checkpoint(sbi); + break; default: return -EINVAL; } From a125702326d9c3b753fe9c9b9727d3b3dd1cba4a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 8 Oct 2015 10:40:07 -0700 Subject: [PATCH 37/54] Revert "f2fs: do not skip dentry block writes" The periodic checkpoint can resolve the previous issue. So, now we can use this again to improve the reported performance regression: https://lkml.org/lkml/2015/10/8/20 This reverts commit 15bec0ff5a9ba6d203178fa8772259df6207942a. 
--- fs/f2fs/data.c | 5 +++++ fs/f2fs/node.c | 5 +++++ fs/f2fs/node.h | 1 + fs/f2fs/segment.h | 4 +++- 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a903423e4cd5..bc04e9201fd6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1340,6 +1340,11 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && + available_free_memory(sbi, DIRTY_DENTS)) + goto skip_write; + /* during POR, we don't need to trigger writepage at all. */ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1fe49ca20757..4d9bedfe101c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -52,6 +52,11 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->wb.dirty_exceeded) + return false; + mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == INO_ENTRIES) { int i; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 51c62edf2e89..7427e956ad81 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -118,6 +118,7 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ + DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e9afb5884312..ee44d346ea44 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -699,7 +699,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) if (sbi->sb->s_bdi->wb.dirty_exceeded) return 0; - if (type == NODE) + if (type == DATA) + return sbi->blocks_per_seg; + else if (type == NODE) return 3 * sbi->blocks_per_seg; else if (type == META) return MAX_BIO_BLOCKS(sbi); From 6e2c64ad7cebf8740c5e1241de374c6b6ea80f81 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Oct 2015 12:28:41 -0700 Subject: [PATCH 38/54] f2fs: fix SSA updates resulting in corruption f2fs_collapse_range and f2fs_insert_range change block addresses directly, but that can leave SSA updates uncovered. In that case, we need to give up changing the block addresses and fall back to buffered writes to keep the filesystem consistent.
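The per-block decision at the heart of the fix can be sketched as follows; everything except is_checkpointed_data() is a hypothetical placeholder for the two paths __exchange_data_block() takes in the diff below:

    /* illustrative sketch only, not the actual kernel code */
    if (!is_checkpointed_data(sbi, blkaddr)) {
            /* block was written after the last checkpoint, so its SSA
             * entry can be rewritten safely: move the address and let
             * f2fs_replace_block() fix up the summary */
            move_block_address(inode, src, dst);            /* hypothetical */
    } else {
            /* block belongs to the last checkpoint; re-pointing it would
             * leave a stale SSA entry, so copy the data through the page
             * cache and let a normal buffered write allocate a new block */
            copy_via_page_cache(inode, src, dst);           /* hypothetical */
    }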
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 +++ fs/f2fs/file.c | 217 +++++++++++++++++++++------------------------- fs/f2fs/segment.c | 33 ++++++- 3 files changed, 142 insertions(+), 119 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f05ae22558e2..6f2310ca3910 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1233,6 +1233,16 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) return sbi->total_valid_inode_count; } +static inline void f2fs_copy_page(struct page *src, struct page *dst) +{ + char *src_kaddr = kmap(src); + char *dst_kaddr = kmap(dst); + + memcpy(dst_kaddr, src_kaddr, PAGE_SIZE); + kunmap(dst); + kunmap(src); +} + static inline void f2fs_put_page(struct page *page, int unlock) { if (!page) @@ -1754,6 +1764,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); +bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6d3cfd55c3e4..8d5583b21e5d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -826,86 +826,100 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +static int __exchange_data_block(struct inode *inode, pgoff_t src, + pgoff_t dst, bool full) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; + block_t new_addr; + bool do_replace = false; + int ret; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + return ret; + } else if (ret == -ENOENT) { + new_addr = NULL_ADDR; + } else { + new_addr = dn.data_blkaddr; + if (!is_checkpointed_data(sbi, new_addr)) { + dn.data_blkaddr = NULL_ADDR; + /* do not invalidate this block address */ + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + do_replace = true; + } + f2fs_put_dnode(&dn); + } + + if (new_addr == NULL_ADDR) + return full ? 
truncate_hole(inode, dst, dst + 1) : 0; + + if (do_replace) { + struct page *ipage = get_node_page(sbi, inode->i_ino); + struct node_info ni; + + if (IS_ERR(ipage)) { + ret = PTR_ERR(ipage); + goto err_out; + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + ret = f2fs_reserve_block(&dn, dst); + if (ret) + goto err_out; + + truncate_data_blocks_range(&dn, 1); + + get_node_info(sbi, dn.nid, &ni); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true); + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = get_lock_data_page(inode, src); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = get_new_data_page(inode, NULL, dst, false); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); + f2fs_put_page(psrc, 1); + + return truncate_hole(inode, src, src + 1); + } + return 0; + +err_out: + if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { + dn.data_blkaddr = new_addr; + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + f2fs_put_dnode(&dn); + } + return ret; +} + +static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; int ret = 0; for (; end < nrpages; start++, end++) { - block_t new_addr, old_addr; - + f2fs_balance_fs(sbi); f2fs_lock_op(sbi); - - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - new_addr = NULL_ADDR; - } else { - new_addr = dn.data_blkaddr; - truncate_data_blocks_range(&dn, 1); - f2fs_put_dnode(&dn); - } - - if (new_addr == NULL_ADDR) { - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - f2fs_unlock_op(sbi); - continue; - } - - if (dn.data_blkaddr == NULL_ADDR) { - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - continue; - } else { - truncate_data_blocks_range(&dn, 1); - } - - f2fs_put_dnode(&dn); - } else { - struct page *ipage; - - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, start); - if (ret) - goto out; - - old_addr = dn.data_blkaddr; - if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) { - dn.data_blkaddr = NULL_ADDR; - f2fs_update_extent_cache(&dn); - invalidate_blocks(sbi, old_addr); - - dn.data_blkaddr = new_addr; - set_data_blkaddr(&dn); - } else if (new_addr != NEW_ADDR) { - struct node_info ni; - - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, old_addr, new_addr, - ni.version, true); - } - - f2fs_put_dnode(&dn); - } + ret = __exchange_data_block(inode, end, start, true); f2fs_unlock_op(sbi); + if (ret) + break; } - return 0; -out: - f2fs_unlock_op(sbi); return ret; } @@ -944,7 +958,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; + /* write out all moved pages, if possible */ + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + new_size = i_size_read(inode) - len; + truncate_pagecache(inode, new_size); ret = truncate_blocks(inode, new_size, true); if (!ret) @@ -1067,7 +1086,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) struct f2fs_sb_info *sbi 
= F2FS_I_SB(inode); pgoff_t pg_start, pg_end, delta, nrpages, idx; loff_t new_size; - int ret; + int ret = 0; new_size = i_size_read(inode) + len; if (new_size > inode->i_sb->s_maxbytes) @@ -1105,57 +1124,19 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { - struct dnode_of_data dn; - struct page *ipage; - block_t new_addr, old_addr; - f2fs_lock_op(sbi); - - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, idx, LOOKUP_NODE_RA); - if (ret && ret != -ENOENT) { - goto out; - } else if (ret == -ENOENT) { - goto next; - } else if (dn.data_blkaddr == NULL_ADDR) { - f2fs_put_dnode(&dn); - goto next; - } else { - new_addr = dn.data_blkaddr; - truncate_data_blocks_range(&dn, 1); - f2fs_put_dnode(&dn); - } - - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, idx + delta); - if (ret) - goto out; - - old_addr = dn.data_blkaddr; - f2fs_bug_on(sbi, old_addr != NEW_ADDR); - - if (new_addr != NEW_ADDR) { - struct node_info ni; - - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, old_addr, new_addr, - ni.version, true); - } - f2fs_put_dnode(&dn); -next: + ret = __exchange_data_block(inode, idx, idx + delta, false); f2fs_unlock_op(sbi); + if (ret) + break; } - i_size_write(inode, new_size); - return 0; -out: - f2fs_unlock_op(sbi); + /* write out all moved pages, if possible */ + filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + truncate_pagecache(inode, offset); + + if (!ret) + i_size_write(inode, new_size); return ret; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1d86a35ae9fe..581a9af549ff 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -768,6 +768,30 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) mutex_unlock(&sit_i->sentry_lock); } +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int segno, offset; + struct seg_entry *se; + bool is_cp = false; + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + return true; + + mutex_lock(&sit_i->sentry_lock); + + segno = GET_SEGNO(sbi, blkaddr); + se = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (f2fs_test_bit(offset, se->ckpt_valid_map)) + is_cp = true; + + mutex_unlock(&sit_i->sentry_lock); + + return is_cp; +} + /* * This function should be resided under the curseg_mutex lock */ @@ -1370,7 +1394,14 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); + if (!recover_curseg) + update_sit_entry(sbi, new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); + locate_dirty_segment(sbi, old_cursegno); if (recover_curseg) { From a56c7c6fb3c60857c1335bcb8b914e6f65655486 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 9 Oct 2015 15:11:38 -0700 Subject: [PATCH 39/54] f2fs: set GFP_NOFS for grab_cache_page For normal inodes, their pages are allocated with __GFP_FS, which can cause filesystem calls when reclaiming memory. 
This can incur a deadlock. So, this patch addresses the problem by introducing f2fs_grab_cache_page(.., bool for_write), which calls grab_cache_page_write_begin() with AOP_FLAG_NOFS. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 16 +++++++++------- fs/f2fs/dir.c | 6 +++--- fs/f2fs/f2fs.h | 12 ++++++++++-- fs/f2fs/file.c | 6 +++--- fs/f2fs/gc.c | 6 +++--- 5 files changed, 28 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bc04e9201fd6..d4f1c74bab26 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -275,7 +275,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) return f2fs_reserve_block(dn, index); } -struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int rw, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -292,7 +293,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return read_mapping_page(mapping, index, NULL); - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) return ERR_PTR(-ENOMEM); @@ -352,7 +353,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC); + page = get_read_data_page(inode, index, READ_SYNC, false); if (IS_ERR(page)) return page; @@ -372,12 +373,13 @@ * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ -struct page *get_lock_data_page(struct inode *inode, pgoff_t index) +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write) { struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC); + page = get_read_data_page(inode, index, READ_SYNC, for_write); if (IS_ERR(page)) return page; @@ -411,7 +413,7 @@ struct page *get_new_data_page(struct inode *inode, struct dnode_of_data dn; int err; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* * before exiting, we should make sure ipage will be released @@ -439,7 +441,7 @@ struct page *get_new_data_page(struct inode *inode, } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC); + page = get_read_data_page(inode, index, READ_SYNC, true); if (IS_ERR(page)) goto repeat; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 6726c4a5efa2..7c1678ba8f92 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -258,7 +258,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) if (f2fs_has_inline_dentry(dir)) return f2fs_parent_inline_dir(dir, p); - page = get_lock_data_page(dir, 0); + page = get_lock_data_page(dir, 0, false); if (IS_ERR(page)) return NULL; @@ -740,7 +740,7 @@ bool f2fs_empty_dir(struct inode *dir) return f2fs_empty_inline_dir(dir); for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = get_lock_data_page(dir, bidx); + dentry_page = get_lock_data_page(dir, bidx, false); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) continue; @@ -854,7 +854,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); for (; n < npages; n++) { - dentry_page =
get_lock_data_page(inode, n); + dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) continue; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6f2310ca3910..6ba5a599f1f6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1233,6 +1233,14 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) return sbi->total_valid_inode_count; } +static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, + pgoff_t index, bool for_write) +{ + if (!for_write) + return grab_cache_page(mapping, index); + return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); +} + static inline void f2fs_copy_page(struct page *src, struct page *dst) { char *src_kaddr = kmap(src); @@ -1831,9 +1839,9 @@ void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int); +struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t); +struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8d5583b21e5d..5d2a2ee35742 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -505,14 +505,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; if (cache_only) { - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (page && PageUptodate(page)) goto truncate_out; f2fs_put_page(page, 1); return 0; } - page = get_lock_data_page(inode, index); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) return 0; truncate_out: @@ -879,7 +879,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src, } else { struct page *psrc, *pdst; - psrc = get_lock_data_page(inode, src); + psrc = get_lock_data_page(inode, src, true); if (IS_ERR(psrc)) return PTR_ERR(psrc); pdst = get_new_data_page(inode, NULL, dst, false); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e7cec86f0747..94278dbbdf7c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -550,7 +550,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) int err; /* do not read out */ - page = grab_cache_page(inode->i_mapping, bidx); + page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); if (!page) return; @@ -620,7 +620,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) { struct page *page; - page = get_lock_data_page(inode, bidx); + page = get_lock_data_page(inode, bidx, true); if (IS_ERR(page)) return; @@ -714,7 +714,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); data_page = get_read_data_page(inode, - start_bidx + ofs_in_node, READA); + start_bidx + ofs_in_node, READA, true); if (IS_ERR(data_page)) { iput(inode); continue; From b8c2940048adf4b2fcc5ae738f2bd4821ebf6a8a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Oct 2015 17:02:26 +0800 Subject: [PATCH 40/54] f2fs: add a tracepoint for f2fs_read_data_pages This patch adds a tracepoint for f2fs_read_data_pages to trace when pages are readahead by VFS. 
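One detail worth noting: the VFS hands ->readpages its page list in reverse order, so list_entry(pages->prev, struct page, lru) in the hunk below picks up the lowest-indexed page, which is why the tracepoint can report it as the start of the readahead window.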
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ include/trace/events/f2fs.h | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d4f1c74bab26..228537657723 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1032,6 +1032,9 @@ static int f2fs_read_data_pages(struct file *file, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_mapping->host; + struct page *page = list_entry(pages->prev, struct page, lru); + + trace_f2fs_readpages(inode, page, nr_pages); /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7de751d5763b..00b4a6308249 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1028,6 +1028,32 @@ TRACE_EVENT(f2fs_writepages, __entry->for_sync) ); +TRACE_EVENT(f2fs_readpages, + + TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage), + + TP_ARGS(inode, page, nrpage), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(pgoff_t, start) + __field(unsigned int, nrpage) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = page->index; + __entry->nrpage = nrpage; + ), + + TP_printk("dev = (%d,%d), ino = %lu, start = %lu nrpage = %u", + show_dev_ino(__entry), + (unsigned long)__entry->start, + __entry->nrpage) +); + TRACE_EVENT(f2fs_write_checkpoint, TP_PROTO(struct super_block *sb, int reason, char *msg), From 2b947003fa98d5a39f3b21214380d0b1daf750b5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Oct 2015 17:04:21 +0800 Subject: [PATCH 41/54] f2fs: don't tag REQ_META for temporary non-meta pages In the recovery and checkpoint flows, we temporarily grab pages in the meta inode's mapping to cache transient data; the data in these pages is not actually f2fs metadata, but we still tag it with the REQ_META flag. However, a lower device such as eMMC may apply optimizations to data of that type. So, to avoid misguided optimization, we had better remove the flag from such temporary non-meta pages. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 20 +++++++++++++++++++- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 2 +- fs/f2fs/recovery.c | 4 ++-- fs/f2fs/segment.c | 3 +++ 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0a9ec4228342..60a95992384f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -47,7 +47,8 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) /* * We guarantee no failure on the returned page.
*/ -struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, + bool is_meta) { struct address_space *mapping = META_MAPPING(sbi); struct page *page; @@ -58,6 +59,9 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) .blk_addr = index, .encrypted_page = NULL, }; + + if (unlikely(!is_meta)) + fio.rw &= ~REQ_META; repeat: page = grab_cache_page(mapping, index); if (!page) { @@ -91,6 +95,17 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) return page; } +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, true); +} + +/* for POR only */ +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) +{ + return __get_meta_page(sbi, index, false); +} + bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { switch (type) { @@ -137,6 +152,9 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type .encrypted_page = NULL, }; + if (unlikely(type == META_POR)) + fio.rw &= ~REQ_META; + for (; nrpages-- > 0; blkno++) { if (!is_valid_blkaddr(sbi, blkno, type)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6ba5a599f1f6..6c0e83c5ced5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1806,6 +1806,7 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4d9bedfe101c..c61dfb695308 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1807,7 +1807,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, ra_meta_pages(sbi, addr, nrpages, META_POR); for (idx = addr; idx < addr + nrpages; idx++) { - struct page *page = get_meta_page(sbi, idx); + struct page *page = get_tmp_page(sbi, idx); rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index c5daec503e7f..75dbc0708c7b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -188,7 +188,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) return 0; - page = get_meta_page(sbi, blkaddr); + page = get_tmp_page(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) break; @@ -480,7 +480,7 @@ static int recover_data(struct f2fs_sb_info *sbi, ra_meta_pages_cond(sbi, blkaddr); - page = get_meta_page(sbi, blkaddr); + page = get_tmp_page(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) { f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 581a9af549ff..13aa7a6eee53 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1317,6 +1317,9 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .encrypted_page = NULL, }; + if (unlikely(page->index >= MAIN_BLKADDR(sbi))) + fio.rw &= ~REQ_META; + set_page_writeback(page); f2fs_submit_page_mbio(&fio); } From 26879fb101f28c554294eaf25ac7817a2825b180 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Oct 2015 17:05:59 +0800 Subject: [PATCH 42/54] f2fs: support lower priority asynchronous readahead in ra_meta_pages Now, we use ra_meta_pages to read as many contiguous physical blocks as possible to improve
the performance of subsequent reads. However, ra_meta_pages takes a synchronous readahead approach, submitting its bios with READ. Since READ is high priority, it is not suitable for preloading blocks when it is unclear if and when the readahead pages will actually be used. This patch supports asynchronous readahead in ra_meta_pages by tagging the bio with the READA flag in order to allow preloading. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 9 +++++---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 2 +- fs/f2fs/node.c | 5 +++-- fs/f2fs/recovery.c | 2 +- fs/f2fs/segment.c | 6 +++--- 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 60a95992384f..f661d80474be 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -140,7 +140,8 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) /* * Readahead CP/NAT/SIT/SSA pages */ -int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type) +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync) { block_t prev_blk_addr = 0; struct page *page; @@ -148,7 +149,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO, + .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, .encrypted_page = NULL, }; @@ -214,7 +215,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR); + ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); } static int f2fs_write_meta_page(struct page *page, @@ -521,7 +522,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); - ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP); + ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { struct page *page = get_meta_page(sbi, start_blk + i); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6c0e83c5ced5..5f372c03de32 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1808,7 +1808,7 @@ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); +int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 94278dbbdf7c..3f19634e2aa5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -840,7 +840,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) /* readahead multi ssa blocks those have contiguous address */ if (sbi->segs_per_sec > 1) ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, - META_SSA); + META_SSA, true); for (i = 0; i < sbi->segs_per_sec; i++) { /* diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c61dfb695308..ad98e35f3fcb 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1529,7 +1529,8 @@ static void build_free_nids(struct f2fs_sb_info *sbi) return; /* readahead nat pages to be scanned */ - ra_meta_pages(sbi,
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, + META_NAT, true); while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1804,7 +1805,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, nrpages = min(last_offset - i, bio_blocks); /* readahead node pages */ - ra_meta_pages(sbi, addr, nrpages, META_POR); + ra_meta_pages(sbi, addr, nrpages, META_POR, true); for (idx = addr; idx < addr + nrpages; idx++) { struct page *page = get_tmp_page(sbi, idx); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 75dbc0708c7b..cbf74f47cce8 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,7 +180,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR); + ra_meta_pages(sbi, blkaddr, 1, META_POR, true); while (1) { struct fsync_inode_entry *entry; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 13aa7a6eee53..5337fd6fdc32 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1621,7 +1621,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) if (npages >= 2) ra_meta_pages(sbi, start_sum_block(sbi), npages, - META_CP); + META_CP, true); /* restore for compacted data summary */ if (read_compacted_summaries(sbi)) @@ -1631,7 +1631,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) if (__exist_node_summaries(sbi)) ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), - NR_CURSEG_TYPE - type, META_CP); + NR_CURSEG_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); @@ -2118,7 +2118,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int nrpages = MAX_BIO_BLOCKS(sbi); do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block;

From 2db2388fcf0d880bd7160264274ed873baec4f0d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Oct 2015 17:07:33 +0800 Subject: [PATCH 43/54] f2fs: readahead for free nids building When there is no free nid in the nid cache, all new node allocators stall until the free nids are reloaded. Reloading is synchronous, since we read 4 NAT pages to rebuild the nid cache, and that causes long latency. This patch readaheads more NAT pages with the READA request flag after the free nids have been reloaded; it helps to improve performance when users allocate node ids intensively.
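[Editor's note] The two patches above and below hinge on one detail: which request flags the readahead bio carries. A minimal sketch of that decision, assuming the 2015-era block flags used in these diffs (READ_SYNC, REQ_META, REQ_PRIO, READA); the helper name is ours for illustration, the actual patch open-codes the conditional in the f2fs_io_info initializer:

	static inline int ra_meta_rw_flags(bool sync)
	{
		/*
		 * Urgent callers block on these pages right away, so submit
		 * a high-priority synchronous read tagged as metadata.
		 */
		if (sync)
			return READ_SYNC | REQ_META | REQ_PRIO;
		/*
		 * Pure preloading: READA lets the block layer treat the
		 * request as best-effort, so it never competes with
		 * foreground I/O.
		 */
		return READA;
	}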
Env: Sandisk 32G sd card time for i in `seq 1 60000`; { echo -n > /mnt/f2fs/$i; echo XXXXXX > /mnt/f2fs/$i;} Before: real 0m2.814s user 0m1.220s sys 0m1.536s After: real 0m2.711s user 0m1.136s sys 0m1.568s Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ad98e35f3fcb..14f46067817a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1560,6 +1560,9 @@ static void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } mutex_unlock(&curseg->curseg_mutex); + + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), + FREE_NID_PAGES, META_NAT, false); } /*

From ea1a29a0bdfffd56ca98335c0655308e8d7d0e22 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 12 Oct 2015 17:08:48 +0800 Subject: [PATCH 44/54] f2fs: export ra_nid_pages to sysfs After building the free nid cache, we try to asynchronously readahead 4 more pages for the next reload; this readahead count is fixed. In some cases, e.g. on an SMR drive, reading a small fixed number of sectors on each readahead may be inefficient because of the high seek overhead, so we'd better let the user configure this parameter from sysfs for their specific workload.
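[Editor's note] A usage sketch, not from the patch: how a tool might raise the new tunable on such a drive from userspace. The device directory name (mmcblk0p1) and the value 16 are made up for illustration:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* hypothetical device directory under /sys/fs/f2fs */
		const char *attr = "/sys/fs/f2fs/mmcblk0p1/ra_nid_pages";
		int fd = open(attr, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* e.g. on an SMR drive, read more NAT pages per readahead batch */
		if (write(fd, "16", 2) != 2)
			perror("write");
		close(fd);
		return 0;
	}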
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 3 ++- fs/f2fs/node.h | 4 +++- fs/f2fs/super.c | 2 ++ 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index e066281dfd4e..0345f2d1c727 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -86,3 +86,9 @@ Date: October 2015 Contact: "Jaegeuk Kim" Description: Controls the checkpoint timing. + +What: /sys/fs/f2fs/<disk>/ra_nid_pages +Date: October 2015 +Contact: "Chao Yu" +Description: + Controls the count of nid pages to be readaheaded. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5f372c03de32..94cf6bc1303f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -510,6 +510,7 @@ struct f2fs_nm_info { nid_t available_nids; /* maximum available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ + unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 14f46067817a..7bcbc6e9c40d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1562,7 +1562,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) mutex_unlock(&curseg->curseg_mutex); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), - FREE_NID_PAGES, META_NAT, false); + nm_i->ra_nid_pages, META_NAT, false); } /* @@ -2005,6 +2005,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->fcnt = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; + nm_i->ra_nid_pages = DEF_RA_NID_PAGES; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 7427e956ad81..e4fffd2d98c4 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -14,9 +14,11 @@ /* node block offset on the NAT area dedicated to the given start node id */ #define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) -/* # of pages to perform readahead before building free nids */ +/* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 4 +#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */ + /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cb23d85a4ed3..3a65e0132352 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -213,6 +213,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); @@ -232,6 +233,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), ATTR_LIST(cp_interval), NULL, };

From 08b39fbd59781729da9fb6367decaf4804a22721 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 8 Oct 2015 13:27:34 +0800 Subject: [PATCH 45/54] f2fs crypto: fix racing of accessing encrypted page among different competitors We use different page caches (normally the inode's page cache for R/W, and the meta inode's page cache for GC) to cache the same physical block belonging to an encrypted inode.
Writeback of these two page caches should be exclusive, but we do not handle the writeback state well, so there are potential races: a) kworker: f2fs_gc: - f2fs_write_data_pages - f2fs_write_data_page - do_write_data_page - write_data_page - f2fs_submit_page_mbio (page#1 in the inode's page cache is queued in the f2fs bio cache, ready to be written to the new blkaddr) - gc_data_segment - move_encrypted_block - pagecache_get_page (page#2 in the meta inode's page cache is filled with the stale contents of the physical block located at the new blkaddr) - f2fs_submit_page_mbio (page#1 is submitted; later, page#2 with invalid data will be submitted as well) b) f2fs_gc: - gc_data_segment - move_encrypted_block - f2fs_submit_page_mbio (page#1 in the meta inode's page cache is queued in the f2fs bio cache, ready to be written to the new blkaddr) user thread: - f2fs_write_begin - f2fs_submit_page_bio (we submit a request to the block layer to update page#2 in the inode's page cache from the physical block located at the new blkaddr, so we may read garbage data from the new blkaddr since GC has not written back page#1 yet) This patch fixes the above potential races for encrypted inodes.
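[Editor's note] Condensing the fix for reference: before either copy of an encrypted block is (re)written, wait for the other mapping's copy to finish writeback. This is an illustrative paraphrase of the diff that follows, with a hypothetical wrapper name; the real patch simply calls the two existing wait helpers at the right spots:

	/*
	 * Illustrative only: gc_path mirrors move_encrypted_block(); the
	 * other branch mirrors do_write_data_page(), f2fs_write_begin()
	 * and f2fs_vm_page_mkwrite().
	 */
	static void wait_for_other_copy(struct f2fs_sb_info *sbi,
					struct page *data_page,
					block_t blkaddr, bool gc_path)
	{
		if (gc_path)
			/* don't cache into META_MAPPING until the inode's
			 * dirty copy has reached disk */
			f2fs_wait_on_page_writeback(data_page, DATA);
		else
			/* don't re-encrypt/overwrite while GC's copy in
			 * META_MAPPING is still in flight */
			f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
	}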
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 20 +++++++++++--------- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 5 +++++ fs/f2fs/gc.c | 12 ++++++++++-- fs/f2fs/segment.c | 17 +++++++++++++++++ 5 files changed, 44 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 228537657723..77dfc9eee380 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -956,21 +956,14 @@ static int f2fs_mpage_readpages(struct address_space *mapping, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - struct page *cpage; ctx = f2fs_get_crypto_ctx(inode); if (IS_ERR(ctx)) goto set_error_page; /* wait the page to be moved by cleaning */ - cpage = find_lock_page( - META_MAPPING(F2FS_I_SB(inode)), - block_nr); - if (cpage) { - f2fs_wait_on_page_writeback(cpage, - DATA); - f2fs_put_page(cpage, 1); - } + f2fs_wait_on_encrypted_page_writeback( + F2FS_I_SB(inode), block_nr); } bio = bio_alloc(GFP_KERNEL, @@ -1064,6 +1057,11 @@ int do_write_data_page(struct f2fs_io_info *fio) } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), + fio->blk_addr); + fio->encrypted_page = f2fs_encrypt(inode, fio->page); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); @@ -1452,6 +1450,10 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, f2fs_wait_on_page_writeback(page, DATA); + /* wait for GCed encrypted page writeback */ + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + if (len == PAGE_CACHE_SIZE) goto out_update; if (PageUptodate(page)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 94cf6bc1303f..c3443da2f991 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1792,6 +1792,7 @@ void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); void f2fs_wait_on_page_writeback(struct page *, enum page_type); +void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5d2a2ee35742..91c51a6d42dd 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -87,6 +87,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA); + + /* wait for GCed encrypted page writeback */ + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + /* if gced page is attached, don't write to cold segment */ clear_cold_data(page); out: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3f19634e2aa5..af7c24caeef9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -559,8 +559,16 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (err) goto out; - if (unlikely(dn.data_blkaddr == NULL_ADDR)) + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { + ClearPageUptodate(page); goto put_out; + } + + /* + * don't cache encrypted data into meta inode until previous dirty + * data were writebacked to avoid racing between GC and flush. + */ + f2fs_wait_on_page_writeback(page, DATA); get_node_info(fio.sbi, dn.nid, &ni); set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); @@ -589,7 +597,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) goto put_page_out; set_page_dirty(fio.encrypted_page); - f2fs_wait_on_page_writeback(fio.encrypted_page, META); + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA); if (clear_page_dirty_for_io(fio.encrypted_page)) dec_page_count(fio.sbi, F2FS_DIRTY_META); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5337fd6fdc32..f37c21233b5c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1484,6 +1484,23 @@ void f2fs_wait_on_page_writeback(struct page *page, } } +void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + struct page *cpage; + + if (blkaddr == NEW_ADDR) + return; + + f2fs_bug_on(sbi, blkaddr == NULL_ADDR); + + cpage = find_lock_page(META_MAPPING(sbi), blkaddr); + if (cpage) { + f2fs_wait_on_page_writeback(cpage, DATA); + f2fs_put_page(cpage, 1); + } +} + static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);

From 84e4214f0868ae77771837d0ed4cc6eff10738ba Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 13 Oct 2015 10:00:53 -0700 Subject: [PATCH 46/54] f2fs: relocate the tracepoint for background_gc Once f2fs_gc is done, wait_ms is updated once more, so its tracepoint should be placed after that update. Reported-by: He YunLei Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index af7c24caeef9..fedbf67a0842 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -77,13 +77,13 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); - trace_f2fs_background_gc(sbi->sb, wait_ms, - prefree_segments(sbi), free_segments(sbi)); - /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) wait_ms = gc_th->no_gc_sleep_time; + trace_f2fs_background_gc(sbi->sb, wait_ms, + prefree_segments(sbi), free_segments(sbi)); + /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi);

From 1d373a0ef7a7bc08f95ca820c627e961fb21e188 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 19 Oct 2015 10:29:51 -0700 Subject: [PATCH 47/54] f2fs: flush dirty data for bmap Users expect bmap to return allocated block addresses. Let's behave like ext4 does.
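[Editor's note] What "users expect" looks like from userspace, as a sketch (the file path is hypothetical; FIBMAP requires root). With this patch, dirty data is flushed first, so the returned value is a real on-disk block address rather than 0 for a block that has not been allocated yet:

	#include <fcntl.h>
	#include <linux/fs.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main(void)
	{
		int blk = 0;	/* in: logical block index, out: physical block */
		int fd = open("/mnt/f2fs/testfile", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (ioctl(fd, FIBMAP, &blk) < 0) {
			perror("FIBMAP");
			close(fd);
			return 1;
		}
		printf("logical block 0 -> physical block %d\n", blk);
		close(fd);
		return 0;
	}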
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 77dfc9eee380..b052e7c262ac 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1667,12 +1667,13 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + if (f2fs_has_inline_data(inode)) + return 0; + + /* make sure allocating whole blocks */ + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + filemap_write_and_wait(mapping); + return generic_block_bmap(mapping, block, get_data_block_bmap); }

From 67f8cf3cee6f398d05de8333c04fea2ddb59c805 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 15 Oct 2015 11:34:49 -0700 Subject: [PATCH 48/54] f2fs: support fiemap for inline_data There is a FIEMAP_EXTENT_DATA_INLINE flag, as pointed out by Marc. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/inline.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b052e7c262ac..972eab7ac071 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -771,6 +771,12 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; + if (f2fs_has_inline_data(inode)) { + ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); + if (ret != -EAGAIN) + return ret; + } + mutex_lock(&inode->i_mutex); if (len >= isize) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c3443da2f991..9db5500d63d9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2054,6 +2054,8 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, bool f2fs_empty_inline_dir(struct inode *); int f2fs_read_inline_dir(struct file *, struct dir_context *, struct f2fs_str *); +int f2fs_inline_data_fiemap(struct inode *, + struct fiemap_extent_info *, __u64, __u64); /* * shrinker.c */ diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3b18e23319c3..bda7126466c0 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -12,6 +12,7 @@ #include <linux/f2fs_fs.h> #include "f2fs.h" +#include "node.h" bool f2fs_may_inline_data(struct inode *inode) { @@ -570,3 +571,38 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, f2fs_put_page(ipage, 1); return 0; } + +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) +{ + __u64 byteaddr, ilen; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | + FIEMAP_EXTENT_LAST; + struct node_info ni; + struct page *ipage; + int err = 0; + + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + if (!f2fs_has_inline_data(inode)) { + err = -EAGAIN; + goto out; + } + + ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + if (start >= ilen) + goto out; + if (start + len < ilen) + ilen = start + len; + ilen -= start; + + get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; + byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); +out: + f2fs_put_page(ipage, 1); + return err; +}

From f96999c35f46fa9bce8a3a2812cd0a28fcde5903 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue,
20 Oct 2015 15:17:19 -0700 Subject: [PATCH 49/54] f2fs: refactor __find_rev_next_{zero}_bit This patch refactors __find_rev_next_{zero}_bit which was disabled previously due to bugs. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 106 +++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 57 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f37c21233b5c..7835e41868f0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -29,6 +29,21 @@ static struct kmem_cache *discard_entry_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; +static unsigned long __reverse_ulong(unsigned char *str) +{ + unsigned long tmp = 0; + int shift = 24, idx = 0; + +#if BITS_PER_LONG == 64 + shift = 56; +#endif + while (shift >= 0) { + tmp |= (unsigned long)str[idx++] << shift; + shift -= BITS_PER_BYTE; + } + return tmp; +} + /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since * MSB and LSB are reversed in a byte by f2fs_set_bit. @@ -38,27 +53,31 @@ static inline unsigned long __reverse_ffs(unsigned long word) int num = 0; #if BITS_PER_LONG == 64 - if ((word & 0xffffffff) == 0) { + if ((word & 0xffffffff00000000UL) == 0) num += 32; + else word >>= 32; - } #endif - if ((word & 0xffff) == 0) { + if ((word & 0xffff0000) == 0) num += 16; + else word >>= 16; - } - if ((word & 0xff) == 0) { + + if ((word & 0xff00) == 0) num += 8; + else word >>= 8; - } + if ((word & 0xf0) == 0) num += 4; else word >>= 4; + if ((word & 0xc) == 0) num += 2; else word >>= 2; + if ((word & 0x2) == 0) num += 1; return num; @@ -68,26 +87,16 @@ static inline unsigned long __reverse_ffs(unsigned long word) * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. * Example: - * LSB <--> MSB - * f2fs_set_bit(0, bitmap) => 0000 0001 - * f2fs_set_bit(7, bitmap) => 1000 0000 + * MSB <--> LSB + * f2fs_set_bit(0, bitmap) => 1000 0000 + * f2fs_set_bit(7, bitmap) => 0000 0001 */ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - while (!f2fs_test_bit(offset, (unsigned char *)addr)) - offset++; - - if (offset > size) - offset = size; - - return offset; -#if 0 const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG - 1); unsigned long tmp; - unsigned long mask, submask; - unsigned long quot, rest; if (offset >= size) return size; @@ -97,14 +106,9 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, if (!offset) goto aligned; - tmp = *(p++); - quot = (offset >> 3) << 3; - rest = offset & 0x7; - mask = ~0UL << quot; - submask = (unsigned char)(0xff << rest) >> rest; - submask <<= quot; - mask &= submask; - tmp &= mask; + tmp = __reverse_ulong((unsigned char *)p); + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) goto found_first; if (tmp) @@ -112,42 +116,34 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, size -= BITS_PER_LONG; result += BITS_PER_LONG; + p++; aligned: while (size & ~(BITS_PER_LONG-1)) { - tmp = *(p++); + tmp = __reverse_ulong((unsigned char *)p); if (tmp) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; + p++; } if (!size) return result; - tmp = *p; + + tmp = __reverse_ulong((unsigned char *)p); found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ + tmp &= (~0UL << (BITS_PER_LONG - size)); + if (!tmp) /* Are any bits set? 
*/ return result + size; /* Nope. */ found_middle: return result + __reverse_ffs(tmp); -#endif } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { - while (f2fs_test_bit(offset, (unsigned char *)addr)) - offset++; - - if (offset > size) - offset = size; - - return offset; -#if 0 const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG - 1); unsigned long tmp; - unsigned long mask, submask; - unsigned long quot, rest; if (offset >= size) return size; @@ -157,40 +153,36 @@ static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, if (!offset) goto aligned; - tmp = *(p++); - quot = (offset >> 3) << 3; - rest = offset & 0x7; - mask = ~(~0UL << quot); - submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest); - submask <<= quot; - mask += submask; - tmp |= mask; + tmp = __reverse_ulong((unsigned char *)p); + tmp |= ~((~0UL << offset) >> offset); + if (size < BITS_PER_LONG) goto found_first; - if (~tmp) + if (tmp != ~0UL) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; + p++; aligned: while (size & ~(BITS_PER_LONG - 1)) { - tmp = *(p++); - if (~tmp) + tmp = __reverse_ulong((unsigned char *)p); + if (tmp != ~0UL) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; + p++; } if (!size) return result; - tmp = *p; + tmp = __reverse_ulong((unsigned char *)p); found_first: - tmp |= ~0UL << size; - if (tmp == ~0UL) /* Are any bits zero? */ + tmp |= ~(~0UL << (BITS_PER_LONG - size)); + if (tmp == ~0UL) /* Are any bits zero? */ return result + size; /* Nope. */ found_middle: return result + __reverse_ffz(tmp); -#endif } void register_inmem_page(struct inode *inode, struct page *page)

From d7b8b384b02f52b20a538e741c75c5d50c0e131f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Oct 2015 18:49:50 -0700 Subject: [PATCH 50/54] f2fs: fix leakage of inmemory atomic pages If we get a failure during commit_atomic_write, abort_volatile_write will be called, but it will not drop the in-memory pages since FI_ATOMIC_FILE is no longer set. Actually, there is no reason to check the flag in abort_volatile_write. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 91c51a6d42dd..a197215ad52b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1443,13 +1443,9 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) f2fs_balance_fs(F2FS_I_SB(inode)); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - commit_inmem_pages(inode, true); - } - - if (f2fs_is_volatile_file(inode)) - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + commit_inmem_pages(inode, true); mnt_drop_write_file(filp); return ret;

From 2b246fb0f60d318cec6901a0326b94b50d5e1dcb Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Oct 2015 19:00:31 -0700 Subject: [PATCH 51/54] f2fs: don't need to submit bio on error case If commit_atomic_write fails, we don't need to submit any bio.
Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7835e41868f0..7efd96ad9aac 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -249,11 +249,11 @@ int commit_inmem_pages(struct inode *inode, bool abort) trace_f2fs_commit_inmem_page(cur->page, INMEM); fio.page = cur->page; err = do_write_data_page(&fio); - submit_bio = true; if (err) { unlock_page(cur->page); break; } + submit_bio = true; } } else { trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);

From 7fee740697e0d9a57d618b6fec79e4c4e09fd606 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 22 Oct 2015 18:18:11 +0800 Subject: [PATCH 52/54] f2fs: fix to clear GCed flag for atomic written page An atomic-write page can be GCed. After committing such a page, we should clear its GCed flag. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7efd96ad9aac..f77b3258454a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -253,6 +253,7 @@ int commit_inmem_pages(struct inode *inode, bool abort) unlock_page(cur->page); break; } + clear_cold_data(cur->page); submit_bio = true; } } else {

From a6be014e1d28339ba7c745fc4ac1efdbf6e2c1a2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 22 Oct 2015 18:23:08 +0800 Subject: [PATCH 53/54] f2fs: fix error path of ->symlink In f2fs's ->symlink we keep a fixed calling order between f2fs_add_link and page_symlink, since f2fs_add_link must initialize the node info first so that page_symlink can use it. But the error path did not release the metadata created before a failing page_symlink, which leaves a corrupt symlink entry in the parent's dentry page. Fix this by calling f2fs_unlink in the error path to remove that link. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index dfa01c88b34b..e48b80c49090 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -410,11 +410,14 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, * If the symlink path is stored into inline_data, there is no * performance regression. */ - if (!err) + if (!err) { filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); - if (IS_DIRSYNC(dir)) - f2fs_sync_fs(sbi->sb, 1); + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); + } else { + f2fs_unlink(dir, dentry); + } kfree(sd); f2fs_fname_crypto_free_buffer(&disk_link);

From beaa57dd986d4f398728c060692fc2452895cfd8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 22 Oct 2015 18:24:12 +0800 Subject: [PATCH 54/54] f2fs: fix to skip shrinking extent nodes In f2fs_shrink_extent_tree we should stop the shrink flow as soon as we have shrunk enough nodes in the extent cache. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index a38ee9bec4ba..7ddba812e11b 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -620,7 +620,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) write_unlock(&et->lock); if (node_cnt + tree_cnt >= nr_shrink) - break; + goto unlock_out; } } unlock_out:
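[Editor's note] The one-line change in the final hunk is about loop nesting: `break` only exits the per-tree iteration, so the scan kept visiting further extent trees after nr_shrink nodes had already been freed. A reduced sketch of the control flow, with hypothetical types and helper functions standing in for the real extent-cache code:

	struct extent_node;
	struct extent_tree;

	/* hypothetical helpers, for illustration only */
	extern struct extent_node *pick_victim(struct extent_tree *et);
	extern void free_node(struct extent_node *en);

	static unsigned int shrink_budgeted(struct extent_tree **trees,
					    int ntrees, unsigned int nr_shrink)
	{
		unsigned int node_cnt = 0;
		int i;

		for (i = 0; i < ntrees; i++) {
			struct extent_node *en;

			while ((en = pick_victim(trees[i])) != NULL) {
				free_node(en);
				if (++node_cnt >= nr_shrink)
					goto out;	/* budget spent: leave BOTH loops */
				/* a bare `break` here would only leave the inner loop */
			}
		}
	out:
		return node_cnt;
	}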