diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 61205e3bbefa..1da5753d886d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -196,6 +196,16 @@ struct btrfs_inode { struct list_head delayed_iput; long delayed_iput_count; + /* + * To avoid races between lockless (i_mutex not held) direct IO writes + * and concurrent fsync requests. Direct IO writes must acquire read + * access on this semaphore for creating an extent map and its + * corresponding ordered extent. The fast fsync path must acquire write + * access on this semaphore before it collects ordered extents and + * extent maps. + */ + struct rw_semaphore dio_sem; + struct inode vfs_inode; }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ee9be4199e7c..c1ee4ade2d87 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7145,6 +7145,43 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag return em; } +static struct extent_map *btrfs_create_dio_extent(struct inode *inode, + const u64 start, + const u64 len, + const u64 orig_start, + const u64 block_start, + const u64 block_len, + const u64 orig_block_len, + const u64 ram_bytes, + const int type) +{ + struct extent_map *em = NULL; + int ret; + + down_read(&BTRFS_I(inode)->dio_sem); + if (type != BTRFS_ORDERED_NOCOW) { + em = create_pinned_em(inode, start, len, orig_start, + block_start, block_len, orig_block_len, + ram_bytes, type); + if (IS_ERR(em)) + goto out; + } + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, + len, block_len, type); + if (ret) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_cache(inode, start, + start + len - 1, 0); + } + em = ERR_PTR(ret); + } + out: + up_read(&BTRFS_I(inode)->dio_sem); + + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -7160,43 +7197,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - /* - * Create the ordered extent before the extent map. This is to avoid - * races with the fast fsync path that would lead to it logging file - * extent items that point to disk extents that were not yet written to. - * The fast fsync path collects ordered extents into a local list and - * then collects all the new extent maps, so we must create the ordered - * extent first and make sure the fast fsync path collects any new - * ordered extents after collecting new extent maps as well. - * The fsync path simply can not rely on inode_dio_wait() because it - * causes deadlock with AIO. - */ - ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, - ins.offset, ins.offset, 0); - if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return ERR_PTR(ret); - } - + em = btrfs_create_dio_extent(inode, start, ins.offset, start, + ins.objectid, ins.offset, ins.offset, + ins.offset, 0); btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); - - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - struct btrfs_ordered_extent *oe; - + if (IS_ERR(em)) btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - oe = btrfs_lookup_ordered_extent(inode, start); - ASSERT(oe); - if (WARN_ON(!oe)) - return em; - set_bit(BTRFS_ORDERED_IOERR, &oe->flags); - set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); - btrfs_remove_ordered_extent(inode, oe); - /* Once for our lookup and once for the ordered extents tree. */ - btrfs_put_ordered_extent(oe); - btrfs_put_ordered_extent(oe); - } + return em; } @@ -7670,57 +7677,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes) == 1 && btrfs_inc_nocow_writers(root->fs_info, block_start)) { + struct extent_map *em2; - /* - * Create the ordered extent before the extent map. This - * is to avoid races with the fast fsync path because it - * collects ordered extents into a local list and then - * collects all the new extent maps, so we must create - * the ordered extent first and make sure the fast fsync - * path collects any new ordered extents after - * collecting new extent maps as well. The fsync path - * simply can not rely on inode_dio_wait() because it - * causes deadlock with AIO. - */ - ret = btrfs_add_ordered_extent_dio(inode, start, - block_start, len, len, type); + em2 = btrfs_create_dio_extent(inode, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); btrfs_dec_nocow_writers(root->fs_info, block_start); - if (ret) { - free_extent_map(em); - goto unlock_err; - } - if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - em = create_pinned_em(inode, start, len, - orig_start, - block_start, len, - orig_block_len, - ram_bytes, type); - if (IS_ERR(em)) { - struct btrfs_ordered_extent *oe; - - ret = PTR_ERR(em); - oe = btrfs_lookup_ordered_extent(inode, - start); - ASSERT(oe); - if (WARN_ON(!oe)) - goto unlock_err; - set_bit(BTRFS_ORDERED_IOERR, - &oe->flags); - set_bit(BTRFS_ORDERED_IO_DONE, - &oe->flags); - btrfs_remove_ordered_extent(inode, oe); - /* - * Once for our lookup and once for the - * ordered extents tree. - */ - btrfs_put_ordered_extent(oe); - btrfs_put_ordered_extent(oe); - goto unlock_err; - } + em = em2; + } + if (em2 && IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto unlock_err; } - goto unlock; } } @@ -9281,6 +9252,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->dio_sem); return inode; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a24a0ba523d6..003a826f4cff 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); + down_write(&BTRFS_I(inode)->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - /* - * Collect any new ordered extents within the range. This is to - * prevent logging file extent items without waiting for the disk - * location they point to being written. We do this only to deal - * with races against concurrent lockless direct IO writes. - */ btrfs_get_logged_extents(inode, logged_list, start, end); + /* + * Some ordered extents started by fsync might have completed + * before we could collect them into the list logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. + */ + ret = btrfs_inode_check_errors(inode); + if (ret) + ctx->io_err = ret; process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4202,6 +4210,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); + up_write(&BTRFS_I(inode)->dio_sem); btrfs_release_path(path); return ret; @@ -4622,23 +4631,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - /* - * Collect ordered extents only if we are logging data. This is to - * ensure a subsequent request to log this inode in LOG_INODE_ALL mode - * will process the ordered extents if they still exists at the time, - * because when we collect them we test and set for the flag - * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the - * same ordered extents. The consequence for the LOG_INODE_ALL log mode - * not processing the ordered extents is that we end up logging the - * corresponding file extent items, based on the extent maps in the - * inode's extent_map_tree's modified_list, without logging the - * respective checksums (since the may still be only attached to the - * ordered extents and have not been inserted in the csum tree by - * btrfs_finish_ordered_io() yet). - */ - if (inode_only == LOG_INODE_ALL) - btrfs_get_logged_extents(inode, &logged_list, start, end); - /* * a brute force approach to making sure we get the most uptodate * copies of everything. @@ -4846,21 +4838,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, goto out_unlock; } if (fast_search) { - /* - * Some ordered extents started by fsync might have completed - * before we collected the ordered extents in logged_list, which - * means they're gone, not in our logged_list nor in the inode's - * ordered tree. We want the application/user space to know an - * error happened while attempting to persist file data so that - * it can take proper action. If such error happened, we leave - * without writing to the log tree and the fsync must report the - * file data write error and not commit the current transaction. - */ - err = btrfs_inode_check_errors(inode); - if (err) { - ctx->io_err = err; - goto out_unlock; - } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, &logged_list, ctx, start, end); if (ret) {