From 5576cd6ca555bd79e76a2d798aec44b28851d369 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 12 Jun 2014 13:25:01 +0800 Subject: [PATCH 01/40] f2fs: avoid unneeded SetPageUptodate in f2fs_write_end We have already set page update in ->write_begin, so we should remove redundant SetPageUptodate in ->write_end. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f8cf619edb5f..2eb2764173ca 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1016,7 +1016,6 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); - SetPageUptodate(page); set_page_dirty(page); if (pos + copied > i_size_read(inode)) { From ca0a81b397b5f153ac2a9880c5e85b08b0447e4e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 12 Jun 2014 13:31:50 +0800 Subject: [PATCH 02/40] f2fs: avoid to truncate non-updated page partially After we call find_data_page in truncate_partial_data_page, we could not guarantee this page is updated or not as error may occurred in lower layer. We'd better check status of the page to avoid this no updated page be writebacked to device. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7d8b96275092..36fa50587bb8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -380,13 +380,15 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) return; lock_page(page); - if (unlikely(page->mapping != inode->i_mapping)) { - f2fs_put_page(page, 1); - return; - } + if (unlikely(!PageUptodate(page) || + page->mapping != inode->i_mapping)) + goto out; + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); + +out: f2fs_put_page(page, 1); } From 50732df02eefb39ab414ef655979c2c9b64ad21c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 19 Jun 2014 16:23:19 +0800 Subject: [PATCH 03/40] f2fs: support ->tmpfile() Add function f2fs_tmpfile() to support O_TMPFILE file creation, and modify logic of init_inode_metadata to enable linkat temp file. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 10 +++++++- fs/f2fs/namei.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a4addd72ebbd..3677f4124541 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -333,10 +333,12 @@ static int make_empty_dir(struct inode *inode, static struct page *init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *name) { + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct page *page; int err; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE) && + inode->i_nlink) { page = new_inode_page(inode, name); if (IS_ERR(page)) return page; @@ -370,6 +372,12 @@ static struct page *init_inode_metadata(struct inode *inode, */ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { file_lost_pino(inode); + /* + * If link the tmpfile to alias through linkat path, + * we should remove this inode from orphan list. + */ + if (inode->i_nlink == 0) + remove_orphan_inode(sbi, inode->i_ino); inc_nlink(inode); } return page; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a6bdddc33ce2..5c08c546e4c8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -487,6 +488,68 @@ out: return err; } +static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + struct page *page; + int err; + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); + if (err) + goto out; + /* + * add this non-linked tmpfile to orphan list, in this way we could + * remove all unused data of tmpfile after abnormal power-off. + */ + add_orphan_inode(sbi, inode->i_ino); + + page = new_inode_page(inode, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto remove_out; + } + + err = f2fs_init_acl(inode, dir, page); + if (err) + goto unlock_out; + + err = f2fs_init_security(inode, dir, NULL, page); + if (err) + goto unlock_out; + + f2fs_put_page(page, 1); + f2fs_unlock_op(sbi); + + alloc_nid_done(sbi, inode->i_ino); + mark_inode_dirty(inode); + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + return 0; +unlock_out: + f2fs_put_page(page, 1); +remove_out: + remove_orphan_inode(sbi, inode->i_ino); +out: + f2fs_unlock_op(sbi); + clear_nlink(inode); + unlock_new_inode(inode); + make_bad_inode(inode); + iput(inode); + alloc_nid_failed(sbi, inode->i_ino); + return err; +} + const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, .lookup = f2fs_lookup, @@ -497,6 +560,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .rmdir = f2fs_rmdir, .mknod = f2fs_mknod, .rename = f2fs_rename, + .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, From b97a9b5da891ab6aff5a6a19c569c9c4c5563d48 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 20 Jun 2014 21:37:02 -0700 Subject: [PATCH 04/40] f2fs: introduce f2fs_do_tmpfile for code consistency This patch adds f2fs_do_tmpfile to eliminate the redundant init_inode_metadata flow. Throught this, we can provide the consistent lock usage, e.g., fi->i_sem, and this will enable better debugging stuffs. Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 27 ++++++++++++++++++++++++--- fs/f2fs/f2fs.h | 1 + fs/f2fs/namei.c | 30 ++++++++---------------------- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 3677f4124541..fb7e01ed47dc 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -337,8 +337,7 @@ static struct page *init_inode_metadata(struct inode *inode, struct page *page; int err; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE) && - inode->i_nlink) { + if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { page = new_inode_page(inode, name); if (IS_ERR(page)) return page; @@ -364,7 +363,8 @@ static struct page *init_inode_metadata(struct inode *inode, set_cold_node(inode, page); } - init_dent_inode(name, page); + if (name) + init_dent_inode(name, page); /* * This file should be checkpointed during fsync. @@ -537,6 +537,27 @@ fail: return err; } +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) +{ + struct page *page; + int err = 0; + + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + /* we don't need to mark_inode_dirty now */ + update_inode(inode, page); + f2fs_put_page(page, 1); + + clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); +fail: + up_write(&F2FS_I(inode)->i_sem); + return err; +} + /* * It only removes the dentry from the dentry page,corresponding name * entry in name page does not need to be touched during deletion. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 58df97e174d0..ca421a8e47d9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1136,6 +1136,7 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, int update_dent_inode(struct inode *, const struct qstr *); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); +int f2fs_do_tmpfile(struct inode *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); bool f2fs_empty_dir(struct inode *); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 5c08c546e4c8..1994d2e449d6 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -493,7 +493,6 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) struct super_block *sb = dir->i_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - struct page *page; int err; inode = f2fs_new_inode(dir, mode); @@ -508,38 +507,25 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) err = acquire_orphan_inode(sbi); if (err) goto out; + + err = f2fs_do_tmpfile(inode, dir); + if (err) + goto release_out; + /* * add this non-linked tmpfile to orphan list, in this way we could * remove all unused data of tmpfile after abnormal power-off. */ add_orphan_inode(sbi, inode->i_ino); - - page = new_inode_page(inode, NULL); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto remove_out; - } - - err = f2fs_init_acl(inode, dir, page); - if (err) - goto unlock_out; - - err = f2fs_init_security(inode, dir, NULL, page); - if (err) - goto unlock_out; - - f2fs_put_page(page, 1); f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); - mark_inode_dirty(inode); d_tmpfile(dentry, inode); unlock_new_inode(inode); return 0; -unlock_out: - f2fs_put_page(page, 1); -remove_out: - remove_orphan_inode(sbi, inode->i_ino); + +release_out: + release_orphan_inode(sbi); out: f2fs_unlock_op(sbi); clear_nlink(inode); From a014e037be26b5c9ee6fb4e49e7804141cf3bb89 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 20 Jun 2014 21:44:02 -0700 Subject: [PATCH 05/40] f2fs: clean up an unused parameter and assignment This patch cleans up simple unnecessary codes. Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/namei.c | 26 +++++++++----------------- fs/f2fs/node.c | 2 +- 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fb7e01ed47dc..087b03dbca9b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -338,7 +338,7 @@ static struct page *init_inode_metadata(struct inode *inode, int err; if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - page = new_inode_page(inode, name); + page = new_inode_page(inode); if (IS_ERR(page)) return page; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ca421a8e47d9..3f0291b840ef 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1174,7 +1174,7 @@ int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); void remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *, const struct qstr *); +struct page *new_inode_page(struct inode *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 1994d2e449d6..89b0bd9420cc 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -23,14 +23,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); nid_t ino; struct inode *inode; bool nid_free = false; int err; - inode = new_inode(sb); + inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -103,8 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; nid_t ino = 0; int err; @@ -147,8 +145,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); int err; f2fs_balance_fs(sbi); @@ -208,8 +205,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, static int f2fs_unlink(struct inode *dir, struct dentry *dentry) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode = dentry->d_inode; struct f2fs_dir_entry *de; struct page *page; @@ -243,8 +239,7 @@ fail: static int f2fs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; size_t symlen = strlen(symname) + 1; int err; @@ -331,8 +326,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) static int f2fs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; int err = 0; @@ -370,8 +364,7 @@ out: static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct super_block *sb = old_dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb); struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct page *old_dir_page; @@ -490,8 +483,7 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; int err; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4b697ccc9b0c..de709f0a445e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -845,7 +845,7 @@ void remove_inode_page(struct inode *inode) truncate_node(&dn); } -struct page *new_inode_page(struct inode *inode, const struct qstr *name) +struct page *new_inode_page(struct inode *inode) { struct dnode_of_data dn; From aec71382c68135261ef6efc3d8a96b7149939446 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Jun 2014 09:18:20 +0800 Subject: [PATCH 06/40] f2fs: refactor flush_nat_entries codes for reducing NAT writes Although building NAT journal in cursum reduce the read/write work for NAT block, but previous design leave us lower performance when write checkpoint frequently for these cases: 1. if journal in cursum has already full, it's a bit of waste that we flush all nat entries to page for persistence, but not to cache any entries. 2. if journal in cursum is not full, we fill nat entries to journal util journal is full, then flush the left dirty entries to disk without merge journaled entries, so these journaled entries may be flushed to disk at next checkpoint but lost chance to flushed last time. In this patch we merge dirty entries located in same NAT block to nat entry set, and linked all set to list, sorted ascending order by entries' count of set. Later we flush entries in sparse set into journal as many as we can, and then flush merged entries to disk. In this way we can not only gain in performance, but also save lifetime of flash device. In my testing environment, it shows this patch can help to reduce NAT block writes obviously. In hard disk test case: cost time of fsstress is stablely reduced by about 5%. 1. virtual machine + hard disk: fsstress -p 20 -n 200 -l 5 node num cp count nodes/cp based 4599.6 1803.0 2.551 patched 2714.6 1829.6 1.483 2. virtual machine + 32g micro SD card: fsstress -p 20 -n 200 -l 1 -w -f chown=0 -f creat=4 -f dwrite=0 -f fdatasync=4 -f fsync=4 -f link=0 -f mkdir=4 -f mknod=4 -f rename=5 -f rmdir=5 -f symlink=0 -f truncate=4 -f unlink=5 -f write=0 -S node num cp count nodes/cp based 84.5 43.7 1.933 patched 49.2 40.0 1.23 Our latency of merging op shows not bad when handling extreme case like: merging a great number of dirty nats: latency(ns) dirty nat count 3089219 24922 5129423 27422 4000250 24523 change log from v1: o fix wrong logic in add_nat_entry when grab a new nat entry set. o swith to create slab cache in create_node_manager_caches. o use GFP_ATOMIC instead of GFP_NOFS to avoid potential long latency. change log from v2: o make comment position more appropriate suggested by Jaegeuk Kim. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 + fs/f2fs/node.c | 257 +++++++++++++++++++++++++++++++++---------------- fs/f2fs/node.h | 7 ++ 3 files changed, 185 insertions(+), 81 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3f0291b840ef..ec480b1a6e33 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -256,6 +256,8 @@ struct f2fs_nm_info { unsigned int nat_cnt; /* the # of cached nat entries */ struct list_head nat_entries; /* cached nat entry list (clean) */ struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ + struct list_head nat_entry_set; /* nat entry set list */ + unsigned int dirty_nat_cnt; /* total num of nat entries in set */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index de709f0a445e..a90f51d32482 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -25,6 +25,7 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; +static struct kmem_cache *nat_entry_set_slab; bool available_free_memory(struct f2fs_sb_info *sbi, int type) { @@ -90,12 +91,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) /* get current nat block page with lock */ src_page = get_meta_page(sbi, src_off); - - /* Dirty src_page means that it is already the new target NAT page. */ - if (PageDirty(src_page)) - return src_page; - dst_page = grab_meta_page(sbi, dst_off); + f2fs_bug_on(PageDirty(src_page)); src_addr = page_address(src_page); dst_addr = page_address(dst_page); @@ -1744,7 +1741,90 @@ skip: return err; } -static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) +static struct nat_entry_set *grab_nat_entry_set(void) +{ + struct nat_entry_set *nes = + f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); + + nes->entry_cnt = 0; + INIT_LIST_HEAD(&nes->set_list); + INIT_LIST_HEAD(&nes->entry_list); + return nes; +} + +static void release_nat_entry_set(struct nat_entry_set *nes, + struct f2fs_nm_info *nm_i) +{ + f2fs_bug_on(!list_empty(&nes->entry_list)); + + nm_i->dirty_nat_cnt -= nes->entry_cnt; + list_del(&nes->set_list); + kmem_cache_free(nat_entry_set_slab, nes); +} + +static void adjust_nat_entry_set(struct nat_entry_set *nes, + struct list_head *head) +{ + struct nat_entry_set *next = nes; + + if (list_is_last(&nes->set_list, head)) + return; + + list_for_each_entry_continue(next, head, set_list) + if (nes->entry_cnt <= next->entry_cnt) + break; + + list_move_tail(&nes->set_list, &next->set_list); +} + +static void add_nat_entry(struct nat_entry *ne, struct list_head *head) +{ + struct nat_entry_set *nes; + nid_t start_nid = START_NID(ne->ni.nid); + + list_for_each_entry(nes, head, set_list) { + if (nes->start_nid == start_nid) { + list_move_tail(&ne->list, &nes->entry_list); + nes->entry_cnt++; + adjust_nat_entry_set(nes, head); + return; + } + } + + nes = grab_nat_entry_set(); + + nes->start_nid = start_nid; + list_move_tail(&ne->list, &nes->entry_list); + nes->entry_cnt++; + list_add(&nes->set_list, head); +} + +static void merge_nats_in_set(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct list_head *dirty_list = &nm_i->dirty_nat_entries; + struct list_head *set_list = &nm_i->nat_entry_set; + struct nat_entry *ne, *tmp; + + write_lock(&nm_i->nat_tree_lock); + list_for_each_entry_safe(ne, tmp, dirty_list, list) { + if (nat_get_blkaddr(ne) == NEW_ADDR) + continue; + add_nat_entry(ne, set_list); + nm_i->dirty_nat_cnt++; + } + write_unlock(&nm_i->nat_tree_lock); +} + +static bool __has_cursum_space(struct f2fs_summary_block *sum, int size) +{ + if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES) + return true; + else + return false; +} + +static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1752,12 +1832,6 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) int i; mutex_lock(&curseg->curseg_mutex); - - if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) { - mutex_unlock(&curseg->curseg_mutex); - return false; - } - for (i = 0; i < nats_in_cursum(sum); i++) { struct nat_entry *ne; struct f2fs_nat_entry raw_ne; @@ -1767,23 +1841,21 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) retry: write_lock(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne) { - __set_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - continue; - } + if (ne) + goto found; + ne = grab_nat_entry(nm_i, nid); if (!ne) { write_unlock(&nm_i->nat_tree_lock); goto retry; } node_info_from_raw_nat(&ne->ni, &raw_ne); +found: __set_nat_cache_dirty(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); } update_nats_in_cursum(sum, -i); mutex_unlock(&curseg->curseg_mutex); - return true; } /* @@ -1794,80 +1866,91 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - struct nat_entry *ne, *cur; - struct page *page = NULL; - struct f2fs_nat_block *nat_blk = NULL; - nid_t start_nid = 0, end_nid = 0; - bool flushed; + struct nat_entry_set *nes, *tmp; + struct list_head *head = &nm_i->nat_entry_set; + bool to_journal = true; - flushed = flush_nats_in_journal(sbi); + /* merge nat entries of dirty list to nat entry set temporarily */ + merge_nats_in_set(sbi); - if (!flushed) - mutex_lock(&curseg->curseg_mutex); + /* + * if there are no enough space in journal to store dirty nat + * entries, remove all entries from journal and merge them + * into nat entry set. + */ + if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) { + remove_nats_in_journal(sbi); - /* 1) flush dirty nat caches */ - list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { - nid_t nid; - struct f2fs_nat_entry raw_ne; - int offset = -1; + /* + * merge nat entries of dirty list to nat entry set temporarily + */ + merge_nats_in_set(sbi); + } - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; + if (!nm_i->dirty_nat_cnt) + return; - nid = nat_get_nid(ne); + /* + * there are two steps to flush nat entries: + * #1, flush nat entries to journal in current hot data summary block. + * #2, flush nat entries to nat page. + */ + list_for_each_entry_safe(nes, tmp, head, set_list) { + struct f2fs_nat_block *nat_blk; + struct nat_entry *ne, *cur; + struct page *page; + nid_t start_nid = nes->start_nid; - if (flushed) - goto to_nat_page; + if (to_journal && !__has_cursum_space(sum, nes->entry_cnt)) + to_journal = false; - /* if there is room for nat enries in curseg->sumpage */ - offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); - if (offset >= 0) { - raw_ne = nat_in_journal(sum, offset); - goto flush_now; - } -to_nat_page: - if (!page || (start_nid > nid || nid > end_nid)) { - if (page) { - f2fs_put_page(page, 1); - page = NULL; - } - start_nid = START_NID(nid); - end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1; - - /* - * get nat block with dirty flag, increased reference - * count, mapped and lock - */ + if (to_journal) { + mutex_lock(&curseg->curseg_mutex); + } else { page = get_next_nat_page(sbi, start_nid); nat_blk = page_address(page); + f2fs_bug_on(!nat_blk); } - f2fs_bug_on(!nat_blk); - raw_ne = nat_blk->entries[nid - start_nid]; -flush_now: - raw_nat_from_node_info(&raw_ne, &ne->ni); + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(ne, cur, &nes->entry_list, list) { + struct f2fs_nat_entry *raw_ne; + nid_t nid = nat_get_nid(ne); + int offset; - if (offset < 0) { - nat_blk->entries[nid - start_nid] = raw_ne; - } else { - nat_in_journal(sum, offset) = raw_ne; - nid_in_journal(sum, offset) = cpu_to_le32(nid); - } + if (to_journal) { + offset = lookup_journal_in_cursum(sum, + NAT_JOURNAL, nid, 1); + f2fs_bug_on(offset < 0); + raw_ne = &nat_in_journal(sum, offset); + nid_in_journal(sum, offset) = cpu_to_le32(nid); + } else { + raw_ne = &nat_blk->entries[nid - start_nid]; + } + raw_nat_from_node_info(raw_ne, &ne->ni); - if (nat_get_blkaddr(ne) == NULL_ADDR && + if (nat_get_blkaddr(ne) == NULL_ADDR && add_free_nid(sbi, nid, false) <= 0) { - write_lock(&nm_i->nat_tree_lock); - __del_from_nat_cache(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - } else { - write_lock(&nm_i->nat_tree_lock); - __clear_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); + write_lock(&nm_i->nat_tree_lock); + __del_from_nat_cache(nm_i, ne); + write_unlock(&nm_i->nat_tree_lock); + } else { + write_lock(&nm_i->nat_tree_lock); + __clear_nat_cache_dirty(nm_i, ne); + write_unlock(&nm_i->nat_tree_lock); + } } + + if (to_journal) + mutex_unlock(&curseg->curseg_mutex); + else + f2fs_put_page(page, 1); + + release_nat_entry_set(nes, nm_i); } - if (!flushed) - mutex_unlock(&curseg->curseg_mutex); - f2fs_put_page(page, 1); + + f2fs_bug_on(!list_empty(head)); + f2fs_bug_on(nm_i->dirty_nat_cnt); } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -1896,6 +1979,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); INIT_LIST_HEAD(&nm_i->dirty_nat_entries); + INIT_LIST_HEAD(&nm_i->nat_entry_set); mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); @@ -1976,19 +2060,30 @@ int __init create_node_manager_caches(void) nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry)); if (!nat_entry_slab) - return -ENOMEM; + goto fail; free_nid_slab = f2fs_kmem_cache_create("free_nid", sizeof(struct free_nid)); - if (!free_nid_slab) { - kmem_cache_destroy(nat_entry_slab); - return -ENOMEM; - } + if (!free_nid_slab) + goto destory_nat_entry; + + nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set", + sizeof(struct nat_entry_set)); + if (!nat_entry_set_slab) + goto destory_free_nid; return 0; + +destory_free_nid: + kmem_cache_destroy(free_nid_slab); +destory_nat_entry: + kmem_cache_destroy(nat_entry_slab); +fail: + return -ENOMEM; } void destroy_node_manager_caches(void) { + kmem_cache_destroy(nat_entry_set_slab); kmem_cache_destroy(free_nid_slab); kmem_cache_destroy(nat_entry_slab); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 7281112cd1c8..8a116a407599 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -89,6 +89,13 @@ enum mem_type { DIRTY_DENTS /* indicates dirty dentry pages */ }; +struct nat_entry_set { + struct list_head set_list; /* link with all nat sets */ + struct list_head entry_list; /* link with dirty nat entries */ + nid_t start_nid; /* start nid of nats in set */ + unsigned int entry_cnt; /* the # of nat entries in set */ +}; + /* * For free nid mangement */ From b434babf854eab82397759e98425423cd9bc4786 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 23 Jun 2014 18:39:15 +0200 Subject: [PATCH 07/40] f2fs: replace count*size kzalloc by kcalloc kcalloc manages count*sizeof overflow. Cc: Jaegeuk Kim Cc: linux-f2fs-devel@lists.sourceforge.net Signed-off-by: Fabian Frederick Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d04613df710a..a4f8375f6ae6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1703,7 +1703,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) struct curseg_info *array; int i; - array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); + array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL); if (!array) return -ENOMEM; From 1256010ab1a372c8ffc01fc37ff767d8bc15378b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 24 Jun 2014 14:16:24 +0800 Subject: [PATCH 08/40] f2fs: reduce region of f2fs_lock_op covered for better concurrency In our rename process, region of f2fs_lock_op covered is too big as some of the code like f2fs_empty_dir/f2fs_find_entry are not needed to protect by this lock. So in the extreme case like doing checkpoint when we rename old inode to exist inode in a large directory could cause lower concurrency. Let's reduce the region of f2fs_lock_op to fix this. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 89b0bd9420cc..1b3cae03c24f 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -387,8 +387,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } - f2fs_lock_op(sbi); - if (new_inode) { err = -ENOTEMPTY; @@ -401,6 +399,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) goto out_dir; + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); if (err) goto put_out_dir; @@ -429,9 +429,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, update_inode_page(old_inode); update_inode_page(new_inode); } else { + f2fs_lock_op(sbi); + err = f2fs_add_link(new_dentry, old_inode); - if (err) + if (err) { + f2fs_unlock_op(sbi); goto out_dir; + } if (old_dir_entry) { inc_nlink(new_dir); @@ -466,6 +470,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; put_out_dir: + f2fs_unlock_op(sbi); kunmap(new_page); f2fs_put_page(new_page, 0); out_dir: @@ -473,7 +478,6 @@ out_dir: kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } - f2fs_unlock_op(sbi); out_old: kunmap(old_page); f2fs_put_page(old_page, 0); From 34e6d456da4a61f655655a6d431657da5753913f Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 24 Jun 2014 18:18:14 +0800 Subject: [PATCH 09/40] f2fs: remove the redundant validation check of acl kernel side(xx_init_acl), the acl is get/cloned from the parent dir's, which is credible. So remove the redundant validation check of acl here. Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index dbe2141d10ad..83b9b5a8d112 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -203,12 +203,6 @@ static int __f2fs_set_acl(struct inode *inode, int type, size_t size = 0; int error; - if (acl) { - error = posix_acl_valid(acl); - if (error < 0) - return error; - } - switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; From 1c3bb97899c91d420562c51882ee81a9d7c35306 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 27 Jun 2014 17:57:04 +0800 Subject: [PATCH 10/40] f2fs: remove the needless point-cast Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 087b03dbca9b..bcbfbc467f46 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -298,14 +298,13 @@ static int make_empty_dir(struct inode *inode, struct page *dentry_page; struct f2fs_dentry_block *dentry_blk; struct f2fs_dir_entry *de; - void *kaddr; dentry_page = get_new_data_page(inode, page, 0, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; + + dentry_blk = kmap_atomic(dentry_page); de = &dentry_blk->dentry[0]; de->name_len = cpu_to_le16(1); @@ -323,7 +322,7 @@ static int make_empty_dir(struct inode *inode, test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); - kunmap_atomic(kaddr); + kunmap_atomic(dentry_blk); set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); @@ -570,14 +569,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); - void *kaddr = page_address(page); int i; lock_page(page); f2fs_wait_on_page_writeback(page, DATA); - dentry_blk = (struct f2fs_dentry_block *)kaddr; - bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; + dentry_blk = page_address(page); + bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); @@ -632,7 +630,6 @@ bool f2fs_empty_dir(struct inode *dir) unsigned long nblock = dir_blocks(dir); for (bidx = 0; bidx < nblock; bidx++) { - void *kaddr; dentry_page = get_lock_data_page(dir, bidx); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) @@ -641,8 +638,8 @@ bool f2fs_empty_dir(struct inode *dir) return false; } - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; + + dentry_blk = kmap_atomic(dentry_page); if (bidx == 0) bit_pos = 2; else @@ -650,7 +647,7 @@ bool f2fs_empty_dir(struct inode *dir) bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); - kunmap_atomic(kaddr); + kunmap_atomic(dentry_blk); f2fs_put_page(dentry_page, 1); From eee6160f2e324f0f6a626159fa1120caad31a6be Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 24 Jun 2014 18:21:23 +0800 Subject: [PATCH 11/40] f2fs: arguments cleanup of finding file flow functions Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 28 +++++++++++++--------------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/hash.c | 4 +++- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index bcbfbc467f46..e84e8807c651 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -77,8 +77,8 @@ static unsigned long dir_block_index(unsigned int level, return bidx; } -static bool early_match_name(const char *name, size_t namelen, - f2fs_hash_t namehash, struct f2fs_dir_entry *de) +static bool early_match_name(size_t namelen, f2fs_hash_t namehash, + struct f2fs_dir_entry *de) { if (le16_to_cpu(de->name_len) != namelen) return false; @@ -90,7 +90,7 @@ static bool early_match_name(const char *name, size_t namelen, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - const char *name, size_t namelen, int *max_slots, + struct qstr *name, int *max_slots, f2fs_hash_t namehash, struct page **res_page) { struct f2fs_dir_entry *de; @@ -109,9 +109,10 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, continue; } de = &dentry_blk->dentry[bit_pos]; - if (early_match_name(name, namelen, namehash, de)) { + if (early_match_name(name->len, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], - name, namelen)) { + name->name, + name->len)) { *res_page = dentry_page; goto found; } @@ -132,10 +133,10 @@ found: } static struct f2fs_dir_entry *find_in_level(struct inode *dir, - unsigned int level, const char *name, size_t namelen, + unsigned int level, struct qstr *name, f2fs_hash_t namehash, struct page **res_page) { - int s = GET_DENTRY_SLOTS(namelen); + int s = GET_DENTRY_SLOTS(name->len); unsigned int nbucket, nblock; unsigned int bidx, end_block; struct page *dentry_page; @@ -160,8 +161,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, continue; } - de = find_in_block(dentry_page, name, namelen, - &max_slots, namehash, res_page); + de = find_in_block(dentry_page, name, &max_slots, + namehash, res_page); if (de) break; @@ -187,8 +188,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct qstr *child, struct page **res_page) { - const char *name = child->name; - size_t namelen = child->len; unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; f2fs_hash_t name_hash; @@ -200,12 +199,11 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, *res_page = NULL; - name_hash = f2fs_dentry_hash(name, namelen); + name_hash = f2fs_dentry_hash(child); max_depth = F2FS_I(dir)->i_current_depth; for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, name, - namelen, name_hash, res_page); + de = find_in_level(dir, level, child, name_hash, res_page); if (de) break; } @@ -460,7 +458,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, int err = 0; int i; - dentry_hash = f2fs_dentry_hash(name->name, name->len); + dentry_hash = f2fs_dentry_hash(name); level = 0; current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ec480b1a6e33..ae3b4ac7a254 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1158,7 +1158,7 @@ void f2fs_msg(struct super_block *, const char *, const char *, ...); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const char *, size_t); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *); /* * node.c diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 6eb8d269b53b..948d17bf7281 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -69,12 +69,14 @@ static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) { __u32 hash; f2fs_hash_t f2fs_hash; const char *p; __u32 in[8], buf[4]; + const char *name = name_info->name; + size_t len = name_info->len; if ((len <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) From 3aab8f828ef358ae92545294a14cd52d510cc585 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Jul 2014 13:25:04 +0800 Subject: [PATCH 12/40] f2fs: introduce f2fs_write_failed to handle error case when write When we fail in ->write_begin()/->direct_IO(), our allocated node block in disk and page cache are still kept, despite these may not be used again. This patch introduce f2fs_write_failed() to handle the error case of these two interfaces, it will truncate page cache and blocks of this file according to i_size. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2eb2764173ca..05154d6de49a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -914,6 +914,16 @@ skip_write: return 0; } +static void f2fs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + truncate_blocks(inode, inode->i_size); + } +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -931,11 +941,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, repeat: err = f2fs_convert_inline_data(inode, pos + len); if (err) - return err; + goto fail; page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; + if (!page) { + err = -ENOMEM; + goto fail; + } /* to avoid latency during memory pressure */ unlock_page(page); @@ -949,10 +961,9 @@ repeat: set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_reserve_block(&dn, index); f2fs_unlock_op(sbi); - if (err) { f2fs_put_page(page, 0); - return err; + goto fail; } inline_data: lock_page(page); @@ -982,19 +993,20 @@ inline_data: err = f2fs_read_inline_data(inode, page); if (err) { page_cache_release(page); - return err; + goto fail; } } else { err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) - return err; + goto fail; } lock_page(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); - return -EIO; + err = -EIO; + goto fail; } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); @@ -1005,6 +1017,9 @@ out: SetPageUptodate(page); clear_cold_data(page); return 0; +fail: + f2fs_write_failed(mapping, pos + len); + return err; } static int f2fs_write_end(struct file *file, @@ -1049,7 +1064,10 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + int err; /* Let buffer I/O handle the inline data case. */ if (f2fs_has_inline_data(inode)) @@ -1061,8 +1079,10 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, /* clear fsync mark to recover these blocks */ fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); - return blockdev_direct_IO(rw, iocb, inode, iter, offset, - get_data_block); + err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block); + if (err < 0 && (rw & WRITE)) + f2fs_write_failed(mapping, offset + count); + return err; } static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, From 6b2920a513ec9972f7b80d219fcf6f59130a1f31 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Jul 2014 11:21:59 +0800 Subject: [PATCH 13/40] f2fs: use inner macro and function to clean up codes In this patch we use below inner macro and function to clean up codes. 1. ADDRS_PER_PAGE 2. SM_I 3. f2fs_readonly Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +-- fs/f2fs/segment.c | 9 ++++----- fs/f2fs/super.c | 7 +++---- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 36fa50587bb8..7c652b3e1511 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -272,8 +272,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) } } - end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a4f8375f6ae6..8a6e57d83c79 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -272,13 +272,13 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; spin_lock_init(&fcc->issue_lock); init_waitqueue_head(&fcc->flush_wait_queue); - sbi->sm_info->cmd_control_info = fcc; + SM_I(sbi)->cmd_control_info = fcc; fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); - sbi->sm_info->cmd_control_info = NULL; + SM_I(sbi)->cmd_control_info = NULL; return err; } @@ -287,13 +287,12 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) { - struct flush_cmd_control *fcc = - sbi->sm_info->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; if (fcc && fcc->f2fs_issue_flush) kthread_stop(fcc->f2fs_issue_flush); kfree(fcc); - sbi->sm_info->cmd_control_info = NULL; + SM_I(sbi)->cmd_control_info = NULL; } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8f96d9372ade..870fe199bafb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -615,7 +615,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. */ - if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; /* @@ -642,8 +642,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { destroy_flush_cmd_control(sbi); - } else if (test_opt(sbi, FLUSH_MERGE) && - !sbi->sm_info->cmd_control_info) { + } else if (test_opt(sbi, FLUSH_MERGE) && !SM_I(sbi)->cmd_control_info) { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; @@ -1082,7 +1081,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * If filesystem is not mounted as read-only then * do start the gc_thread. */ - if (!(sb->s_flags & MS_RDONLY)) { + if (!f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) From 81e366f87f52c62671fb1d8050572e68dbcf1d22 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 10 Jul 2014 12:37:46 +0800 Subject: [PATCH 14/40] f2fs: check name_len of dir entry to prevent from deadloop We assume that modification of some special application could result in zeroed name_len, or it is consciously made by somebody. We will deadloop in find_in_block when name_len of dir entry is zero. This patch is added for preventing deadloop in above scenario. change log from v1: o use f2fs_bug_on rather than break out from searching dir entry suggested by Jaegeuk Kim. Jaegeuk describe: "Well, IMO, it would be good to add f2fs_bug_on() here with a specific comment. In the current phase of f2fs, it is more important to investigate the file system bugs, rather than workarounds for any corrupted images. And, definitely it needs to stop the kernel if any corrupted image was mounted, so that we can figure out where the bugs are occurred." Suggested-by: Jaegeuk Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e84e8807c651..bcf893c3d903 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -121,6 +121,13 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, *max_slots = max_len; max_len = 0; } + + /* + * For the most part, it should be a bug when name_len is zero. + * We stop here for figuring out where the bugs are occurred. + */ + f2fs_bug_on(!de->name_len); + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } From 7a6c76b1b2f2c9555571647f07fc7ccada0a1b0a Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 11 Jul 2014 18:35:43 +0800 Subject: [PATCH 15/40] f2fs: cleanup the needless return of f2fs_create_root_stats Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index b52c12cf5873..3f99266ccc17 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -345,21 +345,14 @@ void __init f2fs_create_root_stats(void) f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - goto bail; + return; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); - if (!file) - goto free_debugfs_dir; - - return; - -free_debugfs_dir: - debugfs_remove(f2fs_debugfs_root); - -bail: - f2fs_debugfs_root = NULL; - return; + if (!file) { + debugfs_remove(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; + } } void f2fs_destroy_root_stats(void) From 4b2868aa4f07eeb64e96c2e0823387c426eca356 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Fri, 11 Jul 2014 18:35:44 +0800 Subject: [PATCH 16/40] f2fs: remove the unused stat_lock Signed-off-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ae3b4ac7a254..8f507d4c0f71 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1298,7 +1298,6 @@ bool space_for_roll_forward(struct f2fs_sb_info *); struct f2fs_stat_info { struct list_head stat_list; struct f2fs_sb_info *sbi; - struct mutex stat_lock; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; int hit_ext, total_ext; From f1121ab0ba9ab9c1592049533b511877600f409e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 14 Jul 2014 16:45:15 +0800 Subject: [PATCH 17/40] f2fs: reduce searching region of segmap when free section In __set_test_and_free we will check whether all segment are free in one section When free one segment, in order to set section to free status. But the searching region of segmap is from start segno to last segno of f2fs, it's not necessary. So let's just only check all segment bitmap of target section. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 7091204680f4..ee5c75e08d9c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -347,8 +347,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; - next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), - start_segno); + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; From 79e35dc3c23dd2ac9f8681361026c82b71a0b006 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sat, 12 Jul 2014 20:10:00 +0800 Subject: [PATCH 18/40] f2fs: add f2fs_balance_fs for direct IO Otherwise, if a large amount of direct IO writes were done, the segment allocation may be failed because no enough segments are gced. Changes: v2: add f2fs_balance_fs into __get_data_block instead of f2fs_direct_IO. Signed-off-by: Huang, Ying Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 05154d6de49a..c77c66723d70 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -626,8 +626,10 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (check_extent_cache(inode, pgofs, bh_result)) goto out; - if (create) + if (create) { + f2fs_balance_fs(sbi); f2fs_lock_op(sbi); + } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); From 32f9bc25cbda00410e2379c58ae027e88bf24770 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 12 Jul 2014 19:13:54 +0800 Subject: [PATCH 19/40] f2fs: support ->rename2() Now new interface ->rename2() is added to VFS, here are related description: https://lkml.org/lkml/2014/2/7/873 https://lkml.org/lkml/2014/2/7/758 This patch adds function f2fs_rename2() to support ->rename2() including handling both RENAME_EXCHANGE and RENAME_NOREPLACE flag. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 1b3cae03c24f..27b03776ffd2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -485,6 +485,169 @@ out: return err; } +static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct super_block *sb = old_dir->i_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *old_dir_page, *new_dir_page; + struct page *old_page, *new_page; + struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; + struct f2fs_dir_entry *old_entry, *new_entry; + int old_nlink = 0, new_nlink = 0; + int err = -ENOENT; + + f2fs_balance_fs(sbi); + + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (!old_entry) + goto out; + + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); + if (!new_entry) + goto out_old; + + /* prepare for updating ".." directory entry info later */ + if (old_dir != new_dir) { + if (S_ISDIR(old_inode->i_mode)) { + err = -EIO; + old_dir_entry = f2fs_parent_dir(old_inode, + &old_dir_page); + if (!old_dir_entry) + goto out_new; + } + + if (S_ISDIR(new_inode->i_mode)) { + err = -EIO; + new_dir_entry = f2fs_parent_dir(new_inode, + &new_dir_page); + if (!new_dir_entry) + goto out_old_dir; + } + } + + /* + * If cross rename between file and directory those are not + * in the same directory, we will inc nlink of file's parent + * later, so we should check upper boundary of its nlink. + */ + if ((!old_dir_entry || !new_dir_entry) && + old_dir_entry != new_dir_entry) { + old_nlink = old_dir_entry ? -1 : 1; + new_nlink = -old_nlink; + err = -EMLINK; + if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + goto out_new_dir; + } + + f2fs_lock_op(sbi); + + err = update_dent_inode(old_inode, &new_dentry->d_name); + if (err) + goto out_unlock; + + err = update_dent_inode(new_inode, &old_dentry->d_name); + if (err) + goto out_undo; + + /* update ".." directory entry info of old dentry */ + if (old_dir_entry) + f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + + /* update ".." directory entry info of new dentry */ + if (new_dir_entry) + f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir); + + /* update directory entry info of old dir inode */ + f2fs_set_link(old_dir, old_entry, old_page, new_inode); + + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + + update_inode_page(old_inode); + + old_dir->i_ctime = CURRENT_TIME; + if (old_nlink) { + down_write(&F2FS_I(old_dir)->i_sem); + if (old_nlink < 0) + drop_nlink(old_dir); + else + inc_nlink(old_dir); + up_write(&F2FS_I(old_dir)->i_sem); + } + mark_inode_dirty(old_dir); + update_inode_page(old_dir); + + /* update directory entry info of new dir inode */ + f2fs_set_link(new_dir, new_entry, new_page, old_inode); + + down_write(&F2FS_I(new_inode)->i_sem); + file_lost_pino(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + + update_inode_page(new_inode); + + new_dir->i_ctime = CURRENT_TIME; + if (new_nlink) { + down_write(&F2FS_I(new_dir)->i_sem); + if (new_nlink < 0) + drop_nlink(new_dir); + else + inc_nlink(new_dir); + up_write(&F2FS_I(new_dir)->i_sem); + } + mark_inode_dirty(new_dir); + update_inode_page(new_dir); + + f2fs_unlock_op(sbi); + return 0; +out_undo: + /* Still we may fail to recover name info of f2fs_inode here */ + update_dent_inode(old_inode, &old_dentry->d_name); +out_unlock: + f2fs_unlock_op(sbi); +out_new_dir: + if (new_dir_entry) { + kunmap(new_dir_page); + f2fs_put_page(new_dir_page, 0); + } +out_old_dir: + if (old_dir_entry) { + kunmap(old_dir_page); + f2fs_put_page(old_dir_page, 0); + } +out_new: + kunmap(new_page); + f2fs_put_page(new_page, 0); +out_old: + kunmap(old_page); + f2fs_put_page(old_page, 0); +out: + return err; +} + +static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return f2fs_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + /* + * VFS has already handled the new dentry existence case, + * here, we just deal with "RENAME_NOREPLACE" as regular rename. + */ + return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); @@ -542,6 +705,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .rmdir = f2fs_rmdir, .mknod = f2fs_mknod, .rename = f2fs_rename, + .rename2 = f2fs_rename2, .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, .setattr = f2fs_setattr, From dbf20cb259e879e2d939fd3fd5c792732d845195 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 25 Jul 2014 12:00:57 +0800 Subject: [PATCH 20/40] f2fs: avoid use invalid mapping of node_inode when evict meta inode Andrey Tsyvarev reported: "Using memory error detector reveals the following use-after-free error in 3.15.0: AddressSanitizer: heap-use-after-free in f2fs_evict_inode Read of size 8 by thread T22279: [] f2fs_evict_inode+0x102/0x2e0 [f2fs] [] evict+0x15f/0x290 [< inlined >] iput+0x196/0x280 iput_final [] iput+0x196/0x280 [] f2fs_put_super+0xd6/0x170 [f2fs] [] generic_shutdown_super+0xc5/0x1b0 [] kill_block_super+0x4d/0xb0 [] deactivate_locked_super+0x66/0x80 [] deactivate_super+0x68/0x80 [] mntput_no_expire+0x198/0x250 [< inlined >] SyS_umount+0xe9/0x1a0 SYSC_umount [] SyS_umount+0xe9/0x1a0 [] system_call_fastpath+0x16/0x1b Freed by thread T3: [] f2fs_i_callback+0x27/0x30 [f2fs] [< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_reclaim [< inlined >] rcu_process_callbacks+0x2d6/0x930 rcu_do_batch [< inlined >] rcu_process_callbacks+0x2d6/0x930 invoke_rcu_callbacks [< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_process_callbacks [] rcu_process_callbacks+0x2d6/0x930 [] __do_softirq+0x142/0x380 [] run_ksoftirqd+0x30/0x50 [] smpboot_thread_fn+0x197/0x280 [] kthread+0x148/0x160 [] ret_from_fork+0x7c/0xb0 Allocated by thread T22276: [] f2fs_alloc_inode+0x2d/0x170 [f2fs] [] iget_locked+0x10a/0x230 [] f2fs_iget+0x35/0xa80 [f2fs] [] f2fs_fill_super+0xb53/0xff0 [f2fs] [] mount_bdev+0x1de/0x240 [] f2fs_mount+0x10/0x20 [f2fs] [] mount_fs+0x55/0x220 [] vfs_kern_mount+0x66/0x200 [< inlined >] do_mount+0x2b4/0x1120 do_new_mount [] do_mount+0x2b4/0x1120 [< inlined >] SyS_mount+0xb2/0x110 SYSC_mount [] SyS_mount+0xb2/0x110 [] system_call_fastpath+0x16/0x1b The buggy address ffff8800587866c8 is located 48 bytes inside of 680-byte region [ffff880058786698, ffff880058786940) Memory state around the buggy address: ffff880058786100: ffffffff ffffffff ffffffff ffffffff ffff880058786200: ffffffff ffffffff ffffffrr rrrrrrrr ffff880058786300: rrrrrrrr rrffffff ffffffff ffffffff ffff880058786400: ffffffff ffffffff ffffffff ffffffff ffff880058786500: ffffffff ffffffff ffffffff fffffffr >ffff880058786600: rrrrrrrr rrrrrrrr rrrfffff ffffffff ^ ffff880058786700: ffffffff ffffffff ffffffff ffffffff ffff880058786800: ffffffff ffffffff ffffffff ffffffff ffff880058786900: ffffffff rrrrrrrr rrrrrrrr rrrr.... ffff880058786a00: ........ ........ ........ ........ ffff880058786b00: ........ ........ ........ ........ Legend: f - 8 freed bytes r - 8 redzone bytes . - 8 allocated bytes x=1..7 - x allocated bytes + (8-x) redzone bytes Investigation shows, that f2fs_evict_inode, when called for 'meta_inode', uses invalidate_mapping_pages() for 'node_inode'. But 'node_inode' is deleted before 'meta_inode' in f2fs_put_super via iput(). It seems that in common usage scenario this use-after-free is benign, because 'node_inode' remains partially valid data even after kmem_cache_free(). But things may change if, while 'meta_inode' is evicted in one f2fs filesystem, another (mounted) f2fs filesystem requests inode from cache, and formely 'node_inode' of the first filesystem is returned." Nids for both meta_inode and node_inode are reservation, so it's not necessary for us to invalidate pages which will never be allocated. To fix this issue, let's skipping needlessly invalidating pages for {meta,node}_inode in f2fs_evict_inode. Reported-by: Andrey Tsyvarev Tested-by: Andrey Tsyvarev Signed-off-by: Gu Zheng Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2cf6962f6cc8..cafba3c9f8d8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -273,7 +273,7 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) - goto no_delete; + goto out_clear; f2fs_bug_on(get_dirty_dents(inode)); remove_dirty_dir_inode(inode); @@ -295,6 +295,7 @@ void f2fs_evict_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); no_delete: - clear_inode(inode); invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); +out_clear: + clear_inode(inode); } From 9d847950770da7102d4efc02d0939ce28e3a7dd0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 25 Jul 2014 12:55:09 +0800 Subject: [PATCH 21/40] f2fs: fix to put root inode in error path of fill_super We should put root inode correctly in error path of fill_super, otherwise we may encounter a leak case of inode resource. Signed-off-by: Chao Yu Reviewed-by: Gu Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 870fe199bafb..34649aa66e04 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1033,8 +1033,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_node_inode; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); err = -EINVAL; - goto free_root_inode; + goto free_node_inode; } sb->s_root = d_make_root(root); /* allocate root dentry */ From 0f7b2abd188089a44f60e2bf8521d1363ada9e12 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Jul 2014 09:57:31 -0700 Subject: [PATCH 22/40] f2fs: add nobarrier mount option This patch adds a mount option, nobarrier, in f2fs. The assumption in here is that file system keeps the IO ordering, but doesn't care about cache flushes inside the storages. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 5 +++++ fs/f2fs/data.c | 5 ++++- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 3 +++ fs/f2fs/super.c | 7 +++++++ 5 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 51afba17bbae..a2046a7d0a9d 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -126,6 +126,11 @@ flush_merge Merge concurrent cache_flush commands as much as possible to eliminate redundant command issues. If the underlying device handles the cache_flush command relatively slowly, recommend to enable this option. +nobarrier This option can be used if underlying storage guarantees + its cached data should be written to the novolatile area. + If this option is set, no cache_flush commands are issued + but f2fs still guarantees the write ordering of all the + data writes. ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c77c66723d70..482313dd9e24 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -139,7 +139,10 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + if (test_opt(sbi, NOBARRIER)) + io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; + else + io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); up_write(&io->io_rwsem); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8f507d4c0f71..e999eec200b7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -41,6 +41,7 @@ #define F2FS_MOUNT_INLINE_XATTR 0x00000080 #define F2FS_MOUNT_INLINE_DATA 0x00000100 #define F2FS_MOUNT_FLUSH_MERGE 0x00000200 +#define F2FS_MOUNT_NOBARRIER 0x00000400 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8a6e57d83c79..9fce0f47eb35 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -239,6 +239,9 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; struct flush_cmd cmd; + if (test_opt(sbi, NOBARRIER)) + return 0; + if (!test_opt(sbi, FLUSH_MERGE)) return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 34649aa66e04..93593ceec175 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -52,6 +52,7 @@ enum { Opt_inline_xattr, Opt_inline_data, Opt_flush_merge, + Opt_nobarrier, Opt_err, }; @@ -69,6 +70,7 @@ static match_table_t f2fs_tokens = { {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_flush_merge, "flush_merge"}, + {Opt_nobarrier, "nobarrier"}, {Opt_err, NULL}, }; @@ -339,6 +341,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; + case Opt_nobarrier: + set_opt(sbi, NOBARRIER); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -544,6 +549,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_data"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); + if (test_opt(sbi, NOBARRIER)) + seq_puts(seq, ",nobarrier"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; From 953e6cc6bcb615dfa373320ffa62b574c6be608a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jul 2014 15:47:16 -0700 Subject: [PATCH 23/40] f2fs: punch the core function for inode management This patch punches out the core functions to manage the inode numbers. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 81 ++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 37 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0b4710c1d370..1c6eb6bcfd99 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -282,6 +282,47 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, }; +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct list_head *head; + struct orphan_inode_entry *new, *e; + + new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); + new->ino = ino; + + spin_lock(&sbi->orphan_inode_lock); + list_for_each_entry(e, &sbi->orphan_inode_list, list) { + if (e->ino == ino) { + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, new); + return; + } + if (e->ino > ino) + break; + } + + /* add new entry into list which is sorted by inode number */ + list_add_tail(&new->list, &e->list); + spin_unlock(&sbi->orphan_inode_lock); +} + +static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct orphan_inode_entry *e; + + spin_lock(&sbi->orphan_inode_lock); + list_for_each_entry(e, &sbi->orphan_inode_list, list) { + if (e->ino == ino) { + list_del(&e->list); + sbi->n_orphans--; + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, e); + return; + } + } + spin_unlock(&sbi->orphan_inode_lock); +} + int acquire_orphan_inode(struct f2fs_sb_info *sbi) { int err = 0; @@ -306,48 +347,14 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head; - struct orphan_inode_entry *new, *orphan; - - new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); - new->ino = ino; - - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; - list_for_each_entry(orphan, head, list) { - if (orphan->ino == ino) { - spin_unlock(&sbi->orphan_inode_lock); - kmem_cache_free(orphan_entry_slab, new); - return; - } - - if (orphan->ino > ino) - break; - } - /* add new orphan entry into list which is sorted by inode number */ - list_add_tail(&new->list, &orphan->list); - spin_unlock(&sbi->orphan_inode_lock); + __add_ino_entry(sbi, ino); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head; - struct orphan_inode_entry *orphan; - - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; - list_for_each_entry(orphan, head, list) { - if (orphan->ino == ino) { - list_del(&orphan->list); - f2fs_bug_on(sbi->n_orphans == 0); - sbi->n_orphans--; - spin_unlock(&sbi->orphan_inode_lock); - kmem_cache_free(orphan_entry_slab, orphan); - return; - } - } - spin_unlock(&sbi->orphan_inode_lock); + /* remove orphan entry from orphan list */ + __remove_ino_entry(sbi, ino); } static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) From 6451e041c8d39daf39c71eefe839641c2093713e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jul 2014 15:47:17 -0700 Subject: [PATCH 24/40] f2fs: add infra for ino management This patch changes the naming of orphan-related data structures to use as inode numbers managed globally. Later, we can use this facility for managing any inode number lists. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 76 +++++++++++++++++++++++--------------------- fs/f2fs/debug.c | 2 +- fs/f2fs/f2fs.h | 19 +++++++---- fs/f2fs/super.c | 2 +- 4 files changed, 55 insertions(+), 44 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1c6eb6bcfd99..f93d154e2770 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -22,7 +22,7 @@ #include "segment.h" #include -static struct kmem_cache *orphan_entry_slab; +static struct kmem_cache *ino_entry_slab; static struct kmem_cache *inode_entry_slab; /* @@ -282,19 +282,18 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, }; -static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino) +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { - struct list_head *head; - struct orphan_inode_entry *new, *e; + struct ino_entry *new, *e; - new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); + new = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC); new->ino = ino; - spin_lock(&sbi->orphan_inode_lock); - list_for_each_entry(e, &sbi->orphan_inode_list, list) { + spin_lock(&sbi->ino_lock[type]); + list_for_each_entry(e, &sbi->ino_list[type], list) { if (e->ino == ino) { - spin_unlock(&sbi->orphan_inode_lock); - kmem_cache_free(orphan_entry_slab, new); + spin_unlock(&sbi->ino_lock[type]); + kmem_cache_free(ino_entry_slab, new); return; } if (e->ino > ino) @@ -303,58 +302,58 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino) /* add new entry into list which is sorted by inode number */ list_add_tail(&new->list, &e->list); - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[type]); } -static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino) +static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { - struct orphan_inode_entry *e; + struct ino_entry *e; - spin_lock(&sbi->orphan_inode_lock); - list_for_each_entry(e, &sbi->orphan_inode_list, list) { + spin_lock(&sbi->ino_lock[type]); + list_for_each_entry(e, &sbi->ino_list[type], list) { if (e->ino == ino) { list_del(&e->list); sbi->n_orphans--; - spin_unlock(&sbi->orphan_inode_lock); - kmem_cache_free(orphan_entry_slab, e); + spin_unlock(&sbi->ino_lock[type]); + kmem_cache_free(ino_entry_slab, e); return; } } - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[type]); } int acquire_orphan_inode(struct f2fs_sb_info *sbi) { int err = 0; - spin_lock(&sbi->orphan_inode_lock); + spin_lock(&sbi->ino_lock[ORPHAN_INO]); if (unlikely(sbi->n_orphans >= sbi->max_orphans)) err = -ENOSPC; else sbi->n_orphans++; - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); return err; } void release_orphan_inode(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->orphan_inode_lock); + spin_lock(&sbi->ino_lock[ORPHAN_INO]); f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); } void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { /* add new orphan entry into list which is sorted by inode number */ - __add_ino_entry(sbi, ino); + __add_ino_entry(sbi, ino, ORPHAN_INO); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { /* remove orphan entry from orphan list */ - __remove_ino_entry(sbi, ino); + __remove_ino_entry(sbi, ino, ORPHAN_INO); } static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -408,14 +407,14 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); struct page *page = NULL; - struct orphan_inode_entry *orphan = NULL; + struct ino_entry *orphan = NULL; for (index = 0; index < orphan_blocks; index++) grab_meta_page(sbi, start_blk + index); index = 1; - spin_lock(&sbi->orphan_inode_lock); - head = &sbi->orphan_inode_list; + spin_lock(&sbi->ino_lock[ORPHAN_INO]); + head = &sbi->ino_list[ORPHAN_INO]; /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { @@ -455,7 +454,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) f2fs_put_page(page, 1); } - spin_unlock(&sbi->orphan_inode_lock); + spin_unlock(&sbi->ino_lock[ORPHAN_INO]); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -939,31 +938,36 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } -void init_orphan_info(struct f2fs_sb_info *sbi) +void init_ino_entry_info(struct f2fs_sb_info *sbi) { - spin_lock_init(&sbi->orphan_inode_lock); - INIT_LIST_HEAD(&sbi->orphan_inode_list); - sbi->n_orphans = 0; + int i; + + for (i = 0; i < MAX_INO_ENTRY; i++) { + spin_lock_init(&sbi->ino_lock[i]); + INIT_LIST_HEAD(&sbi->ino_list[i]); + } + /* * considering 512 blocks in a segment 8 blocks are needed for cp * and log segment summaries. Remaining blocks are used to keep * orphan entries with the limitation one reserved segment * for cp pack we can have max 1020*504 orphan entries */ + sbi->n_orphans = 0; sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) { - orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", - sizeof(struct orphan_inode_entry)); - if (!orphan_entry_slab) + ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry", + sizeof(struct ino_entry)); + if (!ino_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", sizeof(struct dir_inode_entry)); if (!inode_entry_slab) { - kmem_cache_destroy(orphan_entry_slab); + kmem_cache_destroy(ino_entry_slab); return -ENOMEM; } return 0; @@ -971,6 +975,6 @@ int __init create_checkpoint_caches(void) void destroy_checkpoint_caches(void) { - kmem_cache_destroy(orphan_entry_slab); + kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(inode_entry_slab); } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 3f99266ccc17..a441ba33be11 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -167,7 +167,7 @@ get_cache: si->cache_mem += npages << PAGE_CACHE_SHIFT; npages = META_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; - si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); + si->cache_mem += sbi->n_orphans * sizeof(struct ino_entry); si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e999eec200b7..b6fa6ec54f98 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -100,8 +100,13 @@ enum { META_SSA }; -/* for the list of orphan inodes */ -struct orphan_inode_entry { +/* for the list of ino */ +enum { + ORPHAN_INO, /* for orphan ino list */ + MAX_INO_ENTRY, /* max. list */ +}; + +struct ino_entry { struct list_head list; /* list head */ nid_t ino; /* inode number */ }; @@ -450,9 +455,11 @@ struct f2fs_sb_info { bool por_doing; /* recovery is doing or not */ wait_queue_head_t cp_wait; - /* for orphan inode management */ - struct list_head orphan_inode_list; /* orphan inode list */ - spinlock_t orphan_inode_lock; /* for orphan inode list */ + /* for inode management */ + spinlock_t ino_lock[MAX_INO_ENTRY]; /* for ino entry lock */ + struct list_head ino_list[MAX_INO_ENTRY]; /* inode list head */ + + /* for orphan inode, use 0'th array */ unsigned int n_orphans; /* # of orphan inodes */ unsigned int max_orphans; /* max orphan inodes */ @@ -1255,7 +1262,7 @@ void add_dirty_dir_inode(struct inode *); void remove_dirty_dir_inode(struct inode *); void sync_dirty_dir_inodes(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool); -void init_orphan_info(struct f2fs_sb_info *); +void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 93593ceec175..f253e222dcea 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1003,7 +1003,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->dir_inode_list); spin_lock_init(&sbi->dir_inode_lock); - init_orphan_info(sbi); + init_ino_entry_info(sbi); /* setup f2fs internal modules */ err = build_segment_manager(sbi); From 39efac41fbe44343cac29472320a1d502fcff66b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Jul 2014 18:15:17 -0700 Subject: [PATCH 25/40] f2fs: use radix_tree for ino management For better ino management, this patch replaces the data structure from list to radix tree. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 53 ++++++++++++++++++++++++-------------------- fs/f2fs/f2fs.h | 1 + 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f93d154e2770..4bf203756cf8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -284,24 +284,27 @@ const struct address_space_operations f2fs_meta_aops = { static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { - struct ino_entry *new, *e; - - new = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC); - new->ino = ino; - + struct ino_entry *e; +retry: spin_lock(&sbi->ino_lock[type]); - list_for_each_entry(e, &sbi->ino_list[type], list) { - if (e->ino == ino) { - spin_unlock(&sbi->ino_lock[type]); - kmem_cache_free(ino_entry_slab, new); - return; - } - if (e->ino > ino) - break; - } - /* add new entry into list which is sorted by inode number */ - list_add_tail(&new->list, &e->list); + e = radix_tree_lookup(&sbi->ino_root[type], ino); + if (!e) { + e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC); + if (!e) { + spin_unlock(&sbi->ino_lock[type]); + goto retry; + } + if (radix_tree_insert(&sbi->ino_root[type], ino, e)) { + spin_unlock(&sbi->ino_lock[type]); + kmem_cache_free(ino_entry_slab, e); + goto retry; + } + memset(e, 0, sizeof(struct ino_entry)); + e->ino = ino; + + list_add_tail(&e->list, &sbi->ino_list[type]); + } spin_unlock(&sbi->ino_lock[type]); } @@ -310,14 +313,15 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) struct ino_entry *e; spin_lock(&sbi->ino_lock[type]); - list_for_each_entry(e, &sbi->ino_list[type], list) { - if (e->ino == ino) { - list_del(&e->list); + e = radix_tree_lookup(&sbi->ino_root[type], ino); + if (e) { + list_del(&e->list); + radix_tree_delete(&sbi->ino_root[type], ino); + if (type == ORPHAN_INO) sbi->n_orphans--; - spin_unlock(&sbi->ino_lock[type]); - kmem_cache_free(ino_entry_slab, e); - return; - } + spin_unlock(&sbi->ino_lock[type]); + kmem_cache_free(ino_entry_slab, e); + return; } spin_unlock(&sbi->ino_lock[type]); } @@ -346,7 +350,7 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - /* add new orphan entry into list which is sorted by inode number */ + /* add new orphan ino entry into list */ __add_ino_entry(sbi, ino, ORPHAN_INO); } @@ -943,6 +947,7 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi) int i; for (i = 0; i < MAX_INO_ENTRY; i++) { + INIT_RADIX_TREE(&sbi->ino_root[i], GFP_ATOMIC); spin_lock_init(&sbi->ino_lock[i]); INIT_LIST_HEAD(&sbi->ino_list[i]); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b6fa6ec54f98..4454caa8a253 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -456,6 +456,7 @@ struct f2fs_sb_info { wait_queue_head_t cp_wait; /* for inode management */ + struct radix_tree_root ino_root[MAX_INO_ENTRY]; /* ino entry array */ spinlock_t ino_lock[MAX_INO_ENTRY]; /* for ino entry lock */ struct list_head ino_list[MAX_INO_ENTRY]; /* inode list head */ From fff04f90c1b9f91b9c513a89702a4b9ffe5dc1c5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jul 2014 07:40:59 -0700 Subject: [PATCH 26/40] f2fs: add info of appended or updated data writes This patch introduces a inode number list in which represents inodes having appended data writes or updated data writes after last checkpoint. This will be used at fsync to determine whether the recovery information should be written or not. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/f2fs/data.c | 2 ++ fs/f2fs/f2fs.h | 7 +++++++ fs/f2fs/inline.c | 1 + fs/f2fs/inode.c | 4 ++++ 5 files changed, 53 insertions(+) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4bf203756cf8..430163d8c806 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -326,6 +326,44 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&sbi->ino_lock[type]); } +void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* add new dirty ino entry into list */ + __add_ino_entry(sbi, ino, type); +} + +void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +{ + /* remove dirty ino entry from list */ + __remove_ino_entry(sbi, ino, type); +} + +/* mode should be APPEND_INO or UPDATE_INO */ +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) +{ + struct ino_entry *e; + spin_lock(&sbi->ino_lock[mode]); + e = radix_tree_lookup(&sbi->ino_root[mode], ino); + spin_unlock(&sbi->ino_lock[mode]); + return e ? true : false; +} + +static void release_dirty_inode(struct f2fs_sb_info *sbi) +{ + struct ino_entry *e, *tmp; + int i; + + for (i = APPEND_INO; i <= UPDATE_INO; i++) { + spin_lock(&sbi->ino_lock[i]); + list_for_each_entry_safe(e, tmp, &sbi->ino_list[i], list) { + list_del(&e->list); + radix_tree_delete(&sbi->ino_root[i], e->ino); + kmem_cache_free(ino_entry_slab, e); + } + spin_unlock(&sbi->ino_lock[i]); + } +} + int acquire_orphan_inode(struct f2fs_sb_info *sbi) { int err = 0; @@ -897,6 +935,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { clear_prefree_segments(sbi); + release_dirty_inode(sbi); F2FS_RESET_SB_DIRT(sbi); } } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 482313dd9e24..ec3c8860539e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -789,9 +789,11 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) !is_cold_data(page) && need_inplace_update(inode))) { rewrite_data_page(page, old_blkaddr, fio); + set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); } else { write_data_page(page, &dn, &new_blkaddr, fio); update_extent_cache(new_blkaddr, &dn); + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); } out_writepage: f2fs_put_dnode(&dn); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4454caa8a253..ab3602576fb2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -103,6 +103,8 @@ enum { /* for the list of ino */ enum { ORPHAN_INO, /* for orphan ino list */ + APPEND_INO, /* for append ino list */ + UPDATE_INO, /* for update ino list */ MAX_INO_ENTRY, /* max. list */ }; @@ -994,6 +996,8 @@ enum { FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ + FI_APPEND_WRITE, /* inode has appended data */ + FI_UPDATE_WRITE, /* inode has in-place-update data */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -1252,6 +1256,9 @@ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); int ra_meta_pages(struct f2fs_sb_info *, int, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); +void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); +void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); +bool exist_written_data(struct f2fs_sb_info *, nid_t, int); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 1bba5228c197..5beeccef9ae1 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -172,6 +172,7 @@ int f2fs_write_inline_data(struct inode *inode, stat_inc_inline_inode(inode); } + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); sync_inode_page(&dn); f2fs_put_dnode(&dn); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index cafba3c9f8d8..0e69aa90238d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -296,6 +296,10 @@ void f2fs_evict_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE)) + add_dirty_inode(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE)) + add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); out_clear: clear_inode(inode); } From 6d99ba41a72c1213f2029e6fc66aa7943339b5db Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Jul 2014 19:08:02 -0700 Subject: [PATCH 27/40] f2fs: skip unnecessary data writes during fsync This patch intends to improve the fsync performance by skipping remaining the recovery information, only when there is no data that we should recover. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7c652b3e1511..9b888fbf08b6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -133,6 +133,17 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return ret; } + /* + * if there is no written data, don't waste time to write recovery info. + */ + if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + !exist_written_data(sbi, inode->i_ino, APPEND_INO)) { + if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + exist_written_data(sbi, inode->i_ino, UPDATE_INO)) + goto flush_out; + goto out; + } + /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); @@ -188,6 +199,13 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_on_node_pages_writeback(sbi, inode->i_ino); if (ret) goto out; + + /* once recovery info is written, don't need to tack this */ + remove_dirty_inode(sbi, inode->i_ino, APPEND_INO); + clear_inode_flag(fi, FI_APPEND_WRITE); +flush_out: + remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO); + clear_inode_flag(fi, FI_UPDATE_WRITE); ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); } out: From ea1aa12ca237149227ef68af50c9a1acf027b625 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Jul 2014 19:11:43 -0700 Subject: [PATCH 28/40] f2fs: enable in-place-update for fdatasync This patch enforces in-place-updates only when fdatasync is requested. If we adopt this in-place-updates for the fdatasync, we can skip to write the recovery information. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 7 +++++++ fs/f2fs/segment.h | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ab3602576fb2..8f8685e16863 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -998,6 +998,7 @@ enum { FI_INLINE_DATA, /* used for inline data*/ FI_APPEND_WRITE, /* inode has appended data */ FI_UPDATE_WRITE, /* inode has in-place-update data */ + FI_NEED_IPU, /* used fo ipu for fdatasync */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9b888fbf08b6..95501358932c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -127,7 +127,14 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return 0; trace_f2fs_sync_file_enter(inode); + + /* if fdatasync is triggered, let's do in-place-update */ + if (datasync) + set_inode_flag(fi, FI_NEED_IPU); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (datasync) + clear_inode_flag(fi, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ee5c75e08d9c..55973f7b0330 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -486,6 +486,10 @@ static inline bool need_inplace_update(struct inode *inode) if (S_ISDIR(inode->i_mode)) return false; + /* this is only set during fdatasync */ + if (is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + return true; + switch (SM_I(sbi)->ipu_policy) { case F2FS_IPU_FORCE: return true; From 01229f5e1b21b378863c91f8c653bbd8e593858c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jul 2014 07:41:43 -0700 Subject: [PATCH 29/40] f2fs: fix wrong condition for unlikely This patch fixes the wrongly used unlikely condition. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 430163d8c806..26b94bbc826c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -933,7 +933,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { + if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { clear_prefree_segments(sbi); release_dirty_inode(sbi); F2FS_RESET_SB_DIRT(sbi); From 61e0f2d0a5f2cddf7cd96fa8cb7fe53a1e5e325d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jul 2014 15:47:23 -0700 Subject: [PATCH 30/40] f2fs: test before set/clear bits If the bit is already set, we don't need to reset it, and vice versa. Because we don't need to make the caches dirty for that. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8f8685e16863..475f97ca49ae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1003,7 +1003,8 @@ enum { static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) { - set_bit(flag, &fi->flags); + if (!test_bit(flag, &fi->flags)) + set_bit(flag, &fi->flags); } static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) @@ -1013,7 +1014,8 @@ static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) { - clear_bit(flag, &fi->flags); + if (test_bit(flag, &fi->flags)) + clear_bit(flag, &fi->flags); } static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) From cf2271e781cb16e1ca22be920010c2b64d90c338 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jul 2014 15:47:25 -0700 Subject: [PATCH 31/40] f2fs: avoid retrying wrong recovery routine when error was occurred This patch eliminates the propagation of recovery errors to the next mount. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 ++- fs/f2fs/f2fs.h | 2 +- fs/f2fs/recovery.c | 20 +++++++++++++++++++- fs/f2fs/segment.c | 5 +---- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 26b94bbc826c..cea20b810f44 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -796,6 +796,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); nid_t last_nid = 0; block_t start_blk; struct page *cp_page; @@ -809,7 +810,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) * This avoids to conduct wrong roll-forward operations and uses * metapages, so should be called prior to sync_meta_pages below. */ - discard_next_dnode(sbi); + discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 475f97ca49ae..14b9f746d5b3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1225,7 +1225,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); -void discard_next_dnode(struct f2fs_sb_info *); +void discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a112368a4a86..b2aa53b99f64 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -434,7 +434,9 @@ next: int recover_fsync_data(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; + block_t blkaddr; int err; bool need_writecp = false; @@ -447,6 +449,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #1: find fsynced inode numbers */ sbi->por_doing = true; + + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + err = find_fsync_dnodes(sbi, &inode_list); if (err) goto out; @@ -462,8 +467,21 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) out: destroy_fsync_dnodes(&inode_list); kmem_cache_destroy(fsync_entry_slab); + + if (err) { + truncate_inode_pages_final(NODE_MAPPING(sbi)); + truncate_inode_pages_final(META_MAPPING(sbi)); + } + sbi->por_doing = false; - if (!err && need_writecp) + if (err) { + discard_next_dnode(sbi, blkaddr); + + /* Flush all the NAT/SIT pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) + sync_meta_pages(sbi, META, LONG_MAX); + } else if (need_writecp) { write_checkpoint(sbi, false); + } return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9fce0f47eb35..e016b97be2ac 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -379,11 +379,8 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -void discard_next_dnode(struct f2fs_sb_info *sbi) +void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - block_t blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - if (f2fs_issue_discard(sbi, blkaddr, 1)) { struct page *page = grab_meta_page(sbi, blkaddr); /* zero-filled page */ From 24a9ee0fa3d40415765d2d9f6064930d72ad8b5a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 25 Jul 2014 17:46:10 -0700 Subject: [PATCH 32/40] f2fs: add tracepoint for f2fs_issue_flush This patch adds a tracepoint for f2fs_issue_flush. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +++ include/trace/events/f2fs.h | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e016b97be2ac..5a53a0ac6e4f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -239,6 +239,9 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; struct flush_cmd cmd; + trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE)); + if (test_opt(sbi, NOBARRIER)) return 0; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index b983990b4a9f..7d2e70e75f16 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -926,6 +926,30 @@ TRACE_EVENT(f2fs_issue_discard, (unsigned long long)__entry->blkstart, (unsigned long long)__entry->blklen) ); + +TRACE_EVENT(f2fs_issue_flush, + + TP_PROTO(struct super_block *sb, bool nobarrier, bool flush_merge), + + TP_ARGS(sb, nobarrier, flush_merge), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(bool, nobarrier) + __field(bool, flush_merge) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->nobarrier = nobarrier; + __entry->flush_merge = flush_merge; + ), + + TP_printk("dev = (%d,%d), %s %s", + show_dev(__entry), + __entry->nobarrier ? "skip (nobarrier)" : "issue", + __entry->flush_merge ? " with flush_merge" : "") +); #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ From 33be828ada7274ebcade2001f16e5b4e33a4636e Mon Sep 17 00:00:00 2001 From: Dongho Sim Date: Wed, 30 Jul 2014 06:52:41 +0000 Subject: [PATCH 33/40] f2fs: remove redundant lines in allocate_data_block There are redundant lines in allocate_data_block. In this function, we call refresh_sit_entry with old seg and old curseg. After that, we call locate_dirty_segment with old curseg. But, the new address is always allocated from old curseg and we call locate_dirty_segment with old curseg in refresh_sit_entry. So, we do not need to call locate_dirty_segment with old curseg again. We've discussed like below: Jaegeuk said: "When considering SSR, we need to take care of the following scenario. - old segno : X - new address : Z - old curseg : Y This means, a new block is supposed to be written to Z from X. And Z is newly allocated in the same path from Y. In that case, we should trigger locate_dirty_segment for Y, since it was a current_segment and can be dirty owing to SSR. But that was not included in the dirty list." Changman said: "We already choosed old curseg(Y) and then we allocate new address(Z) from old curseg(Y). After that we call refresh_sit_entry(old address, new address). In the funcation, we call locate_dirty_segment with old seg and old curseg. So calling locate_dirty_segment after refresh_sit_entry again is redundant." Jaegeuk said: "Right. The new address is always allocated from old_curseg." Reviewed-by: Chao Yu Signed-off-by: Dongho Sim Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5a53a0ac6e4f..c3b76d091450 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -976,14 +976,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; - unsigned int old_cursegno; curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - old_cursegno = curseg->segno; /* * __add_sum_entry should be resided under the curseg_mutex @@ -1004,7 +1002,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); mutex_unlock(&sit_i->sentry_lock); From 65b85ccce03f17b3098273192b20885f3fead820 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Jul 2014 17:25:54 -0700 Subject: [PATCH 34/40] f2fs: fix coding style This patch fixes wrong coding style. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 14b9f746d5b3..80c78695546d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -781,7 +781,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) if (flag == NAT_BITMAP) return &ckpt->sit_nat_version_bitmap; else - return ((unsigned char *)ckpt + F2FS_BLKSIZE); + return (unsigned char *)ckpt + F2FS_BLKSIZE; } else { offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 95501358932c..2a91a7d1760c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -231,8 +231,9 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, /* find first dirty page index */ pagevec_init(&pvec, 0); - nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index: LONG_MAX; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, + PAGECACHE_TAG_DIRTY, 1); + pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; pagevec_release(&pvec); return pgofs; } From b3582c68920105e29d219714d8a6fbde25a43379 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 3 Jul 2014 18:58:39 +0800 Subject: [PATCH 35/40] f2fs: reduce competition among node page writes We do not need to block on ->node_write among different node page writers e.g. fsync/flush, unless we have a node page writer from write_checkpoint. So it's better use rw_semaphore instead of mutex type for ->node_write to promote performance. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++--- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 4 ++-- fs/f2fs/super.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index cea20b810f44..6aeed5bada52 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -762,10 +762,10 @@ retry_flush_dents: * until finishing nat/sit flush. */ retry_flush_nodes: - mutex_lock(&sbi->node_write); + down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - mutex_unlock(&sbi->node_write); + up_write(&sbi->node_write); sync_node_pages(sbi, 0, &wbc); goto retry_flush_nodes; } @@ -774,7 +774,7 @@ retry_flush_nodes: static void unblock_operations(struct f2fs_sb_info *sbi) { - mutex_unlock(&sbi->node_write); + up_write(&sbi->node_write); f2fs_unlock_all(sbi); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 80c78695546d..99425675c2f8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -452,7 +452,7 @@ struct f2fs_sb_info { struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ - struct mutex node_write; /* locking node writes */ + struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ bool por_doing; /* recovery is doing or not */ wait_queue_head_t cp_wait; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a90f51d32482..7b5b5def65fe 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1231,12 +1231,12 @@ static int f2fs_write_node_page(struct page *page, if (wbc->for_reclaim) goto redirty_out; - mutex_lock(&sbi->node_write); + down_read(&sbi->node_write); set_page_writeback(page); write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); - mutex_unlock(&sbi->node_write); + up_read(&sbi->node_write); unlock_page(page); return 0; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f253e222dcea..657582fc7601 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -953,7 +953,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->gc_mutex); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); - mutex_init(&sbi->node_write); + init_rwsem(&sbi->node_write); sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); From 70407fad85f2ec87a0cf56057c3267cd3aa22768 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 31 Jul 2014 21:11:22 +0800 Subject: [PATCH 36/40] f2fs: add tracepoint for f2fs_direct_IO This patch adds a tracepoint for f2fs_direct_IO. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 +++ include/trace/events/f2fs.h | 63 +++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ec3c8860539e..03313099c51c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1086,9 +1086,14 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, /* clear fsync mark to recover these blocks */ fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); + trace_f2fs_direct_IO_enter(inode, offset, count, rw); + err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block); if (err < 0 && (rw & WRITE)) f2fs_write_failed(mapping, offset + count); + + trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); + return err; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7d2e70e75f16..d06d44363fea 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -587,6 +587,69 @@ TRACE_EVENT(f2fs_fallocate, __entry->ret) ); +TRACE_EVENT(f2fs_direct_IO_enter, + + TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), + + TP_ARGS(inode, offset, len, rw), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, pos) + __field(unsigned long, len) + __field(int, rw) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = offset; + __entry->len = len; + __entry->rw = rw; + ), + + TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu rw = %d", + show_dev_ino(__entry), + __entry->pos, + __entry->len, + __entry->rw) +); + +TRACE_EVENT(f2fs_direct_IO_exit, + + TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, + int rw, int ret), + + TP_ARGS(inode, offset, len, rw, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, pos) + __field(unsigned long, len) + __field(int, rw) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = offset; + __entry->len = len; + __entry->rw = rw; + __entry->ret = ret; + ), + + TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu " + "rw = %d ret = %d", + show_dev_ino(__entry), + __entry->pos, + __entry->len, + __entry->rw, + __entry->ret) +); + TRACE_EVENT(f2fs_reserve_new_block, TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node), From 70cfed88efa760fd165fc413cfd1801b5cc8acd2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 2 Aug 2014 15:26:04 +0800 Subject: [PATCH 37/40] f2fs: avoid skipping recover_inline_xattr after recover_inline_data When we recover data of inode in roll-forward procedure, and the inode has both inline data and inline xattr. We may skip recovering inline xattr if we recover inline data form node page first. This patch will fix the problem that we lost inline xattr data in above scenario. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/node.c | 4 +--- fs/f2fs/recovery.c | 2 ++ 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 99425675c2f8..4dab5338a97a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1204,6 +1204,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); void recover_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, struct node_info *, block_t); +void recover_inline_xattr(struct inode *, struct page *); bool recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); int restore_node_summary(struct f2fs_sb_info *, unsigned int, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7b5b5def65fe..d3d90d284631 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1549,7 +1549,7 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, clear_node_page_dirty(page); } -static void recover_inline_xattr(struct inode *inode, struct page *page) +void recover_inline_xattr(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); void *src_addr, *dst_addr; @@ -1588,8 +1588,6 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) nid_t new_xnid = nid_of_node(page); struct node_info ni; - recover_inline_xattr(inode, page); - if (!f2fs_has_xattr_block(ofs_of_node(page))) return false; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b2aa53b99f64..fe1c6d921ba2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -300,6 +300,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct node_info ni; int err = 0, recovered = 0; + recover_inline_xattr(inode, page); + if (recover_inline_data(inode, page)) goto out; From 002a41cabb5829d59c0337dcb5fa3893e0bb15fd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Aug 2014 09:54:58 +0800 Subject: [PATCH 38/40] f2fs: invalidate xattr node page when evict inode When inode is evicted, all the page cache belong to this inode should be released including the xattr node page. But previously we didn't do this, this patch fixed this issue. v2: o reposition invalidate_mapping_pages() to the right place suggested by Jaegeuk Kim. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0e69aa90238d..2c39999f3868 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -267,6 +267,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -296,6 +297,8 @@ void f2fs_evict_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + if (xnid) + invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE)) add_dirty_inode(sbi, inode->i_ino, APPEND_INO); if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE)) From 497a0930bb4bd148451e1af239a082e3916b681b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Aug 2014 10:11:17 +0800 Subject: [PATCH 39/40] f2fs: add f2fs_balance_fs for expand_inode_data This patch adds f2fs_balance_fs in expand_inode_data to avoid allocation failure with segment. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2a91a7d1760c..208f1a9bd569 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -672,6 +672,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t off_start, off_end; int ret = 0; + f2fs_balance_fs(sbi); + ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; From b65ee14818e67127aa242fe1dbd3711b9c095cc0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Aug 2014 10:10:07 +0800 Subject: [PATCH 40/40] f2fs: use for_each_set_bit to simplify the code This patch uses for_each_set_bit to simplify some codes in f2fs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 7 ++----- fs/f2fs/segment.c | 13 ++++--------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b90dbe55403a..d7947d90ccc3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -186,7 +186,6 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int hint = 0; unsigned int secno; /* @@ -194,11 +193,9 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) * selected by background GC before. * Those segments guarantee they have small valid blocks. */ -next: - secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++); - if (secno < TOTAL_SECS(sbi)) { + for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) { if (sec_usage_check(sbi, secno)) - goto next; + continue; clear_bit(secno, dirty_i->victim_secmap); return secno * sbi->segs_per_sec; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c3b76d091450..0dfeebae2a50 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -439,17 +439,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno = -1; + unsigned int segno; unsigned int total_segs = TOTAL_SEGS(sbi); mutex_lock(&dirty_i->seglist_lock); - while (1) { - segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - segno + 1); - if (segno >= total_segs) - break; + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], total_segs) __set_test_and_free(sbi, segno); - } mutex_unlock(&dirty_i->seglist_lock); } @@ -1531,7 +1526,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) struct page *page = NULL; struct f2fs_sit_block *raw_sit = NULL; unsigned int start = 0, end = 0; - unsigned int segno = -1; + unsigned int segno; bool flushed; mutex_lock(&curseg->curseg_mutex); @@ -1543,7 +1538,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) */ flushed = flush_sits_in_journal(sbi); - while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) { + for_each_set_bit(segno, bitmap, nsegs) { struct seg_entry *se = get_seg_entry(sbi, segno); int sit_offset, offset;