diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ccd25ba7a9ac..9a8622a5b867 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -5,6 +5,9 @@ config BTRFS_FS select ZLIB_DEFLATE select LZO_COMPRESS select LZO_DECOMPRESS + select RAID6_PQ + select XOR_BLOCKS + help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 7df3e0f0ee51..3932224f99e9 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o qgroup.o send.o dev-replace.o + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04edf69be875..bd605c87adfd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, err = __resolve_indirect_ref(fs_info, search_commit_root, time_seq, ref, parents, extent_item_pos); - if (err) { - if (ret == 0) - ret = err; + if (err) continue; - } /* we put the first parent into the ref at hand */ ULIST_ITER_INIT(&uiter); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index d61feca79455..310a7f6d09b1 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -19,7 +19,7 @@ #ifndef __BTRFS_BACKREF__ #define __BTRFS_BACKREF__ -#include "ioctl.h" +#include <linux/btrfs.h> #include "ulist.h" #include "extent_io.h" diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 2a8c242bc4f5..d9b97d4960e6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -40,6 +40,8 @@ #define BTRFS_INODE_HAS_ASYNC_EXTENT 6 #define BTRFS_INODE_NEEDS_FULL_SYNC 7 #define BTRFS_INODE_COPY_EVERYTHING 8 +#define BTRFS_INODE_IN_DELALLOC_LIST 9 +#define BTRFS_INODE_READDIO_NEED_LOCK 10 /* in memory btrfs inode */ struct btrfs_inode { @@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) return 0; } +/* + * Disable DIO read nolock optimization, so new dio readers will be forced + * to grab i_mutex. It is used to avoid the endless truncate due to + * nonlocked dio read.
+ */ +static inline void btrfs_inode_block_unlocked_dio(struct inode *inode) +{ + set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags); + smp_mb(); +} + +static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode) +{ + smp_mb__before_clear_bit(); + clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags); +} + #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 11d47bfb62b4..18af6f48781a 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror( (bh->b_data + (dev_bytenr & 4095)); if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, - sizeof(super_tmp->magic)) || + super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) || memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || btrfs_super_nodesize(super_tmp) != state->metablock_size || btrfs_super_leafsize(super_tmp) != state->metablock_size || diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 94ab2f80e7e3..15b94089abc4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, + ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, PAGE_CACHE_SIZE, bio, 0); else @@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->index = em_start >> PAGE_CACHE_SHIFT; if (comp_bio->bi_size) - ret = tree->ops->merge_bio_hook(page, 0, + ret = tree->ops->merge_bio_hook(READ, page, 0, PAGE_CACHE_SIZE, comp_bio, 0); else diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index eea5da7a2b9a..ecd25a1b4e51 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1138,6 +1138,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, switch (tm->op) { case MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); + /* Fallthrough */ case MOD_LOG_KEY_REMOVE_WHILE_MOVING: case MOD_LOG_KEY_REMOVE: btrfs_set_node_key(eb, &tm->key, tm->slot); @@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, __tree_mod_log_rewind(eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > - BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root)); + BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); return eb_rewin; } @@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) */ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, int cache_only, u64 *last_ret, + int start_slot, u64 *last_ret, struct btrfs_key *progress) { struct extent_buffer *cur; @@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; parent_level = btrfs_header_level(parent); - if (cache_only && parent_level != 1) - return 0; WARN_ON(trans->transaction != root->fs_info->running_transaction); WARN_ON(trans->transid != root->fs_info->generation); @@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, else uptodate = 0; if (!cur || !uptodate) { - if (cache_only) { - free_extent_buffer(cur); - continue; - } if (!cur) { cur = read_tree_block(root, blocknr, blocksize, gen); @@ -4825,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) /* * A helper function to walk down the 
tree starting at min_key, and looking - * for nodes or leaves that are either in cache or have a minimum - * transaction id. This is used by the btree defrag code, and tree logging + * for nodes or leaves that have a minimum transaction id. + * This is used by the btree defrag code, and tree logging * * This does not cow, but it does stuff the starting key it finds back * into min_key, so you can call btrfs_search_slot with cow=1 on the @@ -4847,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) */ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_key *max_key, - struct btrfs_path *path, int cache_only, + struct btrfs_path *path, u64 min_trans) { struct extent_buffer *cur; @@ -4887,15 +4882,12 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, if (sret && slot > 0) slot--; /* - * check this node pointer against the cache_only and - * min_trans parameters. If it isn't in cache or is too - * old, skip to the next one. + * check this node pointer against the min_trans parameter. + * If it is too old, skip to the next one. */ while (slot < nritems) { u64 blockptr; u64 gen; - struct extent_buffer *tmp; - struct btrfs_disk_key disk_key; blockptr = btrfs_node_blockptr(cur, slot); gen = btrfs_node_ptr_generation(cur, slot); @@ -4903,27 +4895,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, slot++; continue; } - if (!cache_only) - break; - - if (max_key) { - btrfs_node_key(cur, &disk_key, slot); - if (comp_keys(&disk_key, max_key) >= 0) { - ret = 1; - goto out; - } - } - - tmp = btrfs_find_tree_block(root, blockptr, - btrfs_level_size(root, level - 1)); - - if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) { - free_extent_buffer(tmp); - break; - } - if (tmp) - free_extent_buffer(tmp); - slot++; + break; } find_next_key: /* @@ -4934,7 +4906,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, path->slots[level] = slot; btrfs_set_path_blocking(path); sret = btrfs_find_next_key(root, path, min_key, level, - cache_only, min_trans); + min_trans); if (sret == 0) { btrfs_release_path(path); goto again; @@ -5399,8 +5371,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, /* * this is similar to btrfs_next_leaf, but does not try to preserve * and fixup the path. It looks for and returns the next key in the - * tree based on the current path and the cache_only and min_trans - * parameters. + * tree based on the current path and the min_trans parameter. * * 0 is returned if another key is found, < 0 if there are any errors * and 1 is returned if there are no higher keys in the tree @@ -5409,8 +5380,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, * calling this function.
*/ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int level, - int cache_only, u64 min_trans) + struct btrfs_key *key, int level, u64 min_trans) { int slot; struct extent_buffer *c; @@ -5461,22 +5431,8 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, if (level == 0) btrfs_item_key_to_cpu(c, key, slot); else { - u64 blockptr = btrfs_node_blockptr(c, slot); u64 gen = btrfs_node_ptr_generation(c, slot); - if (cache_only) { - struct extent_buffer *cur; - cur = btrfs_find_tree_block(root, blockptr, - btrfs_level_size(root, level - 1)); - if (!cur || - btrfs_buffer_uptodate(cur, gen, 1) <= 0) { - slot++; - if (cur) - free_extent_buffer(cur); - goto next; - } - free_extent_buffer(cur); - } if (gen < min_trans) { slot++; goto next; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 547b7b05727f..0d82922179db 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -31,10 +31,10 @@ #include #include #include +#include #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" -#include "ioctl.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; -#define BTRFS_MAGIC "_BHRfS_M" +#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ #define BTRFS_MAX_MIRRORS 3 @@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 }; /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) +#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. @@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) /* * File system states */ +#define BTRFS_FS_STATE_ERROR 0 +#define BTRFS_FS_STATE_REMOUNTING 1 +/* Super block flags */ /* Errors detected */ #define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) @@ -502,6 +507,7 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) +#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -511,6 +517,7 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) /* @@ -952,8 +959,20 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) #define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE -#define BTRFS_NR_RAID_TYPES 5 + +enum btrfs_raid_types { + BTRFS_RAID_RAID10, + BTRFS_RAID_RAID1, + BTRFS_RAID_DUP, + BTRFS_RAID_RAID0, + BTRFS_RAID_SINGLE, + BTRFS_RAID_RAID5, + BTRFS_RAID_RAID6, + BTRFS_NR_RAID_TYPES +}; #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ BTRFS_BLOCK_GROUP_SYSTEM | \ @@ -961,6 +980,8 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6 | \ BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID10) /* @@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache { u64 flags; u64 sectorsize; 
u64 cache_generation; + + /* for raid56, this is a full stripe, without parity */ + unsigned long full_stripe_len; + unsigned int ro:1; unsigned int dirty:1; unsigned int iref:1; @@ -1225,6 +1250,28 @@ struct seq_list { u64 seq; }; +enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash { + struct list_head hash_list; + wait_queue_head_t wait; + spinlock_t lock; +}; + +/* used by the raid56 code to lock stripes for read/modify/write */ +struct btrfs_stripe_hash_table { + struct list_head stripe_cache; + spinlock_t cache_lock; + int cache_size; + struct btrfs_stripe_hash table[]; +}; + +#define BTRFS_STRIPE_HASH_TABLE_BITS 11 + /* fs_info */ struct reloc_control; struct btrfs_device; @@ -1250,6 +1297,7 @@ struct btrfs_fs_info { /* block group cache stuff */ spinlock_t block_group_cache_lock; + u64 first_logical_byte; struct rb_root block_group_cache_tree; /* keep track of unallocated space */ @@ -1288,7 +1336,23 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long mount_opt; unsigned long compress_type:4; + /* + * It is a suggestive number, the read side is safe even it gets a + * wrong number because we will write out the data into a regular + * extent. The write side(mount/remount) is under ->s_umount lock, + * so it is also safe. + */ u64 max_inline; + /* + * Protected by ->chunk_mutex and sb->s_umount. + * + * The reason that we use two lock to protect it is because only + * remount and mount operations can change it and these two operations + * are under sb->s_umount, but the read side (chunk allocation) can not + * acquire sb->s_umount or the deadlock would happen. So we use two + * locks to protect it. On the write side, we must acquire two locks, + * and on the read side, we just need acquire one of them. + */ u64 alloc_start; struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; @@ -1307,6 +1371,13 @@ struct btrfs_fs_info { struct mutex cleaner_mutex; struct mutex chunk_mutex; struct mutex volume_mutex; + + /* this is used during read/modify/write to make sure + * no two ios are trying to mod the same stripe at the same + * time + */ + struct btrfs_stripe_hash_table *stripe_hash_table; + /* * this protects the ordered operations list only while we are * processing all of the entries on it. This way we make @@ -1365,6 +1436,7 @@ struct btrfs_fs_info { */ struct list_head ordered_extents; + spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. It is possible for * this list to be empty even when there is still dirty data=ordered @@ -1372,13 +1444,6 @@ struct btrfs_fs_info { */ struct list_head delalloc_inodes; - /* - * special rename and truncate targets that must be on disk before - * we're allowed to commit. This is basically the ext3 style - * data=ordered list. - */ - struct list_head ordered_operations; - /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming after reads. 
This is because readers @@ -1395,6 +1460,8 @@ struct btrfs_fs_info { struct btrfs_workers flush_workers; struct btrfs_workers endio_workers; struct btrfs_workers endio_meta_workers; + struct btrfs_workers endio_raid56_workers; + struct btrfs_workers rmw_workers; struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers endio_freespace_worker; @@ -1423,10 +1490,12 @@ struct btrfs_fs_info { u64 total_pinned; - /* protected by the delalloc lock, used to keep from writing - * metadata until there is a nice batch - */ - u64 dirty_metadata_bytes; + /* used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; + s32 dirty_metadata_batch; + s32 delalloc_batch; + struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; @@ -1442,9 +1511,6 @@ struct btrfs_fs_info { struct reloc_control *reloc_ctl; - spinlock_t delalloc_lock; - u64 delalloc_bytes; - /* data_alloc_cluster is only used in ssd mode */ struct btrfs_free_cluster data_alloc_cluster; @@ -1456,6 +1522,8 @@ struct btrfs_fs_info { struct rb_root defrag_inodes; atomic_t defrag_running; + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; /* * these three are in extended format (availability of single * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other @@ -1520,7 +1588,7 @@ struct btrfs_fs_info { u64 qgroup_seq; /* filesystem state */ - u64 fs_state; + unsigned long fs_state; struct btrfs_delayed_root *delayed_root; @@ -1623,6 +1691,9 @@ struct btrfs_root { struct list_head root_list; + spinlock_t log_extents_lock[2]; + struct list_head logged_list[2]; + spinlock_t orphan_lock; atomic_t orphan_inodes; struct btrfs_block_rsv *orphan_block_rsv; @@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) /* @@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, + u64 *qgroup_reserved); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); @@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct 
extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, - int cache_only, u64 min_trans); + u64 min_trans); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_key *max_key, - struct btrfs_path *path, int cache_only, + struct btrfs_path *path, u64 min_trans); enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_NEW, @@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, int find_higher, int return_any); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, int cache_only, u64 *last_ret, + int start_slot, u64 *last_ret, struct btrfs_key *progress); void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); @@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, u64 new_dirid); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, unsigned long bio_flags); - +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); @@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int cache_only); + struct btrfs_root *root); /* sysfs.c */ int btrfs_init_sysfs(void); @@ -3620,11 +3696,14 @@ __printf(5, 6) void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ #define btrfs_panic(fs_info, errno, fmt, args...) 
\ do { \ - struct btrfs_fs_info *_i = (fs_info); \ - __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \ - BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ } while (0) /* acl.c */ @@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid) return 1; return 0; } + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ + return signal_pending(current); +} + + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 34836036f01b..0b278b117cbe 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -875,7 +875,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_delayed_item *delayed_item) { struct extent_buffer *leaf; - struct btrfs_item *item; char *ptr; int ret; @@ -886,7 +885,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); ptr = btrfs_item_ptr(leaf, path->slots[0], char); write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, @@ -1065,32 +1063,25 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) } } -static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_node *node) +static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) { struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; int ret; - mutex_lock(&node->mutex); - if (!node->inode_dirty) { - mutex_unlock(&node->mutex); - return 0; - } - key.objectid = node->inode_id; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; + ret = btrfs_lookup_inode(trans, root, path, &key, 1); if (ret > 0) { btrfs_release_path(path); - mutex_unlock(&node->mutex); return -ENOENT; } else if (ret < 0) { - mutex_unlock(&node->mutex); return ret; } @@ -1105,11 +1096,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_delayed_inode_release_metadata(root, node); btrfs_release_delayed_inode(node); - mutex_unlock(&node->mutex); return 0; } +static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + mutex_lock(&node->mutex); + if (!node->inode_dirty) { + mutex_unlock(&node->mutex); + return 0; + } + + ret = __btrfs_update_delayed_inode(trans, root, path, node); + mutex_unlock(&node->mutex); + return ret; +} + +static inline int +__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_delayed_node *node) +{ + int ret; + + ret = btrfs_insert_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_delete_delayed_items(trans, path, node->root, node); + if (ret) + return ret; + + ret = btrfs_update_delayed_inode(trans, node->root, path, node); + return ret; +} + /* * Called when committing the transaction. * Returns 0 on success. 
@@ -1119,7 +1146,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int nr) { - struct btrfs_root *curr_root = root; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; @@ -1142,15 +1168,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, curr_node = btrfs_first_delayed_node(delayed_root); while (curr_node && (!count || (count && nr--))) { - curr_root = curr_node->root; - ret = btrfs_insert_delayed_items(trans, path, curr_root, - curr_node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, - curr_root, curr_node); - if (!ret) - ret = btrfs_update_delayed_inode(trans, curr_root, - path, curr_node); + ret = __btrfs_commit_inode_delayed_items(trans, path, + curr_node); if (ret) { btrfs_release_delayed_node(curr_node); curr_node = NULL; @@ -1183,36 +1202,12 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, return __btrfs_run_delayed_items(trans, root, nr); } -static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_delayed_node *node) -{ - struct btrfs_path *path; - struct btrfs_block_rsv *block_rsv; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->leave_spinning = 1; - - block_rsv = trans->block_rsv; - trans->block_rsv = &node->root->fs_info->delayed_block_rsv; - - ret = btrfs_insert_delayed_items(trans, path, node->root, node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, node->root, node); - if (!ret) - ret = btrfs_update_delayed_inode(trans, node->root, path, node); - btrfs_free_path(path); - - trans->block_rsv = block_rsv; - return ret; -} - int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret; if (!delayed_node) @@ -1226,8 +1221,74 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, } mutex_unlock(&delayed_node->mutex); - ret = __btrfs_commit_inode_delayed_items(trans, delayed_node); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; + + ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + btrfs_release_delayed_node(delayed_node); + btrfs_free_path(path); + trans->block_rsv = block_rsv; + + return ret; +} + +int btrfs_commit_inode_delayed_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); + struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; + int ret; + + if (!delayed_node) + return 0; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->inode_dirty) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; + } + mutex_unlock(&delayed_node->mutex); + + trans = btrfs_join_transaction(delayed_node->root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto trans_out; + } + path->leave_spinning = 1; + + block_rsv = trans->block_rsv; + trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; + + mutex_lock(&delayed_node->mutex); + if (delayed_node->inode_dirty) + ret = 
__btrfs_update_delayed_inode(trans, delayed_node->root, + path, delayed_node); + else + ret = 0; + mutex_unlock(&delayed_node->mutex); + + btrfs_free_path(path); + trans->block_rsv = block_rsv; +trans_out: + btrfs_end_transaction(trans, delayed_node->root); + btrfs_btree_balance_dirty(delayed_node->root); +out: + btrfs_release_delayed_node(delayed_node); + return ret; } @@ -1258,7 +1319,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_root *root; struct btrfs_block_rsv *block_rsv; int need_requeue = 0; - int ret; async_node = container_of(work, struct btrfs_async_delayed_node, work); @@ -1277,14 +1337,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) block_rsv = trans->block_rsv; trans->block_rsv = &root->fs_info->delayed_block_rsv; - ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, root, - delayed_node); - - if (!ret) - btrfs_update_delayed_inode(trans, root, path, delayed_node); - + __btrfs_commit_inode_delayed_items(trans, path, delayed_node); /* * Maybe new delayed items have been inserted, so we need requeue * the work. Besides that, we must dequeue the empty delayed nodes diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 4f808e1baeed..78b6ad0fc669 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -117,6 +117,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, /* Used for evicting the inode. */ void btrfs_remove_delayed_node(struct inode *inode); void btrfs_kill_delayed_inode_items(struct inode *inode); +int btrfs_commit_inode_delayed_inode(struct inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ae9411773397..b7a0641ead77 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -23,6 +23,10 @@ #include "delayed-ref.h" #include "transaction.h" +struct kmem_cache *btrfs_delayed_ref_head_cachep; +struct kmem_cache *btrfs_delayed_tree_ref_cachep; +struct kmem_cache *btrfs_delayed_data_ref_cachep; +struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * delayed back reference update tracking. For subvolume trees * we queue up extent allocations and backref maintenance for @@ -422,6 +426,14 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, return 1; } +void btrfs_release_ref_cluster(struct list_head *cluster) +{ + struct list_head *pos, *q; + + list_for_each_safe(pos, q, cluster) + list_del_init(pos); +} + /* * helper function to update an extent delayed ref in the * rbtree. 
existing and update must both have the same @@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, ref->extent_op->flags_to_set; existing_ref->extent_op->update_flags = 1; } - kfree(ref->extent_op); + btrfs_free_delayed_extent_op(ref->extent_op); } } /* @@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(head_ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); } else { delayed_refs->num_heads++; delayed_refs->num_heads_ready++; @@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(full_ref); + kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; @@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, * we've updated the existing ref, free the newly * allocated ref */ - kfree(full_ref); + kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } else { delayed_refs->num_entries++; trans->delayed_ref_updates++; @@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; BUG_ON(extent_op && extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); + ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kfree(ref); + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); return -ENOMEM; } @@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs; BUG_ON(extent_op && !extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); + ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kfree(ref); + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); return -ENOMEM; } @@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) return -ENOMEM; @@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) return btrfs_delayed_node_to_head(ref); return NULL; } + +void btrfs_delayed_ref_exit(void) +{ + if (btrfs_delayed_ref_head_cachep) + kmem_cache_destroy(btrfs_delayed_ref_head_cachep); + if (btrfs_delayed_tree_ref_cachep) + kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); + if (btrfs_delayed_data_ref_cachep) + kmem_cache_destroy(btrfs_delayed_data_ref_cachep); + if (btrfs_delayed_extent_op_cachep) + kmem_cache_destroy(btrfs_delayed_extent_op_cachep); +} + +int btrfs_delayed_ref_init(void) +{ + btrfs_delayed_ref_head_cachep = kmem_cache_create( + "btrfs_delayed_ref_head", + sizeof(struct btrfs_delayed_ref_head), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_ref_head_cachep) + goto fail; + + btrfs_delayed_tree_ref_cachep = kmem_cache_create( + "btrfs_delayed_tree_ref", + sizeof(struct btrfs_delayed_tree_ref), 0, + 
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_tree_ref_cachep) + goto fail; + + btrfs_delayed_data_ref_cachep = kmem_cache_create( + "btrfs_delayed_data_ref", + sizeof(struct btrfs_delayed_data_ref), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_data_ref_cachep) + goto fail; + + btrfs_delayed_extent_op_cachep = kmem_cache_create( + "btrfs_delayed_extent_op", + sizeof(struct btrfs_delayed_extent_op), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_delayed_extent_op_cachep) + goto fail; + + return 0; +fail: + btrfs_delayed_ref_exit(); + return -ENOMEM; +} diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c9d703693df0..f75fcaf79aeb 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -131,6 +131,15 @@ struct btrfs_delayed_ref_root { /* total number of head nodes ready for processing */ unsigned long num_heads_ready; + /* + * bumped when someone is making progress on the delayed + * refs, so that other procs know they are just adding to + * contention intead of helping + */ + atomic_t procs_running_refs; + atomic_t ref_seq; + wait_queue_head_t wait; + /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need @@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root { u64 run_delayed_start; }; +extern struct kmem_cache *btrfs_delayed_ref_head_cachep; +extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; +extern struct kmem_cache *btrfs_delayed_data_ref_cachep; +extern struct kmem_cache *btrfs_delayed_extent_op_cachep; + +int btrfs_delayed_ref_init(void); +void btrfs_delayed_ref_exit(void); + +static inline struct btrfs_delayed_extent_op * +btrfs_alloc_delayed_extent_op(void) +{ + return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); +} + +static inline void +btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) +{ + if (op) + kmem_cache_free(btrfs_delayed_extent_op_cachep, op); +} + static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { WARN_ON(atomic_read(&ref->refs) == 0); if (atomic_dec_and_test(&ref->refs)) { WARN_ON(ref->in_tree); - kfree(ref); + switch (ref->type) { + case BTRFS_TREE_BLOCK_REF_KEY: + case BTRFS_SHARED_BLOCK_REF_KEY: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + case BTRFS_SHARED_DATA_REF_KEY: + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + break; + case 0: + kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); + break; + default: + BUG(); + } } } @@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head); +static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) +{ + mutex_unlock(&head->mutex); +} + int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, struct list_head *cluster, u64 search_start); +void btrfs_release_ref_cluster(struct list_head *cluster); int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 66dbc8dbddf7..7ba7b3900cb8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being 
finished */ - btrfs_start_delalloc_inodes(root, 0); + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return ret; + } btrfs_wait_ordered_extents(root, 0); trans = btrfs_start_transaction(root, 0); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a8f652dc940b..02369a3c162e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" +#include "raid56.h" #ifdef CONFIG_X86 #include @@ -56,7 +57,8 @@ static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); -static void btrfs_destroy_ordered_operations(struct btrfs_root *root); +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, + struct btrfs_root *root); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); @@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) { struct extent_io_tree *tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 found_start; struct extent_buffer *eb; @@ -639,8 +641,15 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, btree_readahead_hook(root, eb, eb->start, ret); } - if (ret) + if (ret) { + /* + * our io error hook is going to dec the io pages + * again, we have to make sure it has something + * to decrement + */ + atomic_inc(&eb->io_pages); clear_extent_buffer_uptodate(eb); + } free_extent_buffer(eb); out: return ret; @@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb = (struct extent_buffer *)page->private; set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); eb->read_mirror = failed_mirror; + atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, -EIO); return -EIO; /* we fixed nothing */ @@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err) end_io_wq->work.flags = 0; if (bio->bi_rw & REQ_WRITE) { - if (end_io_wq->metadata == 1) + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) btrfs_queue_worker(&fs_info->endio_meta_write_workers, &end_io_wq->work); - else if (end_io_wq->metadata == 2) + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) btrfs_queue_worker(&fs_info->endio_freespace_worker, &end_io_wq->work); + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) + btrfs_queue_worker(&fs_info->endio_raid56_workers, + &end_io_wq->work); else btrfs_queue_worker(&fs_info->endio_write_workers, &end_io_wq->work); } else { - if (end_io_wq->metadata) + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) + btrfs_queue_worker(&fs_info->endio_raid56_workers, + &end_io_wq->work); + else if (end_io_wq->metadata) btrfs_queue_worker(&fs_info->endio_meta_workers, &end_io_wq->work); else @@ -695,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err) * 0 - if data * 1 - if normal metadta * 2 - if writing to the free space cache area + * 3 - raid parity work */ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata) @@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct 
extent_io_tree *tree; + struct btrfs_fs_info *fs_info; + int ret; + tree = &BTRFS_I(mapping->host)->io_tree; if (wbc->sync_mode == WB_SYNC_NONE) { - struct btrfs_root *root = BTRFS_I(mapping->host)->root; - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; if (wbc->for_kupdate) return 0; + fs_info = BTRFS_I(mapping->host)->root->fs_info; /* this is a bit racy, but that's ok */ - num_dirty = root->fs_info->dirty_metadata_bytes; - if (num_dirty < thresh) + ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret < 0) return 0; } return btree_write_cache_pages(mapping, wbc); @@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + struct btrfs_fs_info *fs_info = root->fs_info; + if (btrfs_header_generation(buf) == - root->fs_info->running_transaction->transid) { + fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= buf->len) - root->fs_info->dirty_metadata_bytes -= buf->len; - else { - spin_unlock(&root->fs_info->delalloc_lock); - btrfs_panic(root->fs_info, -EOVERFLOW, - "Can't clear %lu bytes from " - " dirty_mdatadata_bytes (%llu)", - buf->len, - root->fs_info->dirty_metadata_bytes); - } - spin_unlock(&root->fs_info->delalloc_lock); - + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -buf->len, + fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(buf); @@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD(&root->logged_list[0]); + INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); spin_lock_init(&root->accounting_lock); + spin_lock_init(&root->log_extents_lock[0]); + spin_lock_init(&root->log_extents_lock[1]); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); init_waitqueue_head(&root->log_writer_wait); @@ -2004,10 +2019,24 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + if (ret) { + err = ret; + goto fail_bdi; + } + fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * + (1 + ilog2(nr_cpu_ids)); + + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + if (ret) { + err = ret; + goto fail_dirty_metadata_bytes; + } + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_bdi; + goto fail_delalloc_bytes; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2017,7 +2046,6 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_inodes); - INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->trans_lock); @@ -2028,6 +2056,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->tree_mod_seq_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); + seqlock_init(&fs_info->profiles_lock); init_completion(&fs_info->kobj_unregister); 
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -2126,6 +2155,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; + fs_info->first_logical_byte = (u64)-1; extent_io_tree_init(&fs_info->freed_extents[0], fs_info->btree_inode->i_mapping); @@ -2165,6 +2195,12 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + ret = btrfs_alloc_stripe_hash_table(fs_info); + if (ret) { + err = ret; + goto fail_alloc; + } + __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); @@ -2187,7 +2223,8 @@ int open_ctree(struct super_block *sb, goto fail_alloc; /* check FS state, whether FS is broken. */ - fs_info->fs_state |= btrfs_super_flags(disk_super); + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { @@ -2261,6 +2298,8 @@ int open_ctree(struct super_block *sb, leafsize = btrfs_super_leafsize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); stripesize = btrfs_super_stripesize(disk_super); + fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); + fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); /* * mixed block groups end up with duplicate but slightly offset @@ -2332,6 +2371,12 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->endio_meta_write_workers, "endio-meta-write", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_raid56_workers, + "endio-raid56", fs_info->thread_pool_size, + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->rmw_workers, + "rmw", fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", fs_info->thread_pool_size, &fs_info->generic_worker); @@ -2350,6 +2395,8 @@ int open_ctree(struct super_block *sb, */ fs_info->endio_workers.idle_thresh = 4; fs_info->endio_meta_workers.idle_thresh = 4; + fs_info->endio_raid56_workers.idle_thresh = 4; + fs_info->rmw_workers.idle_thresh = 2; fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; @@ -2366,6 +2413,8 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->fixup_workers); ret |= btrfs_start_workers(&fs_info->endio_workers); ret |= btrfs_start_workers(&fs_info->endio_meta_workers); + ret |= btrfs_start_workers(&fs_info->rmw_workers); + ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); ret |= btrfs_start_workers(&fs_info->endio_write_workers); ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); @@ -2390,8 +2439,7 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, - sizeof(disk_super->magic))) { + if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) { printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2694,13 +2742,13 @@ int open_ctree(struct super_block *sb, * kthreads */ filemap_write_and_wait(fs_info->btree_inode->i_mapping); - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_block_groups: btrfs_free_block_groups(fs_info); fail_tree_roots: free_root_pointers(fs_info, 1); + 
invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); @@ -2710,6 +2758,8 @@ int open_ctree(struct super_block *sb, btrfs_stop_workers(&fs_info->workers); btrfs_stop_workers(&fs_info->endio_workers); btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_raid56_workers); + btrfs_stop_workers(&fs_info->rmw_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -2721,13 +2771,17 @@ int open_ctree(struct super_block *sb, fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); +fail_delalloc_bytes: + percpu_counter_destroy(&fs_info->delalloc_bytes); +fail_dirty_metadata_bytes: + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: + btrfs_free_stripe_hash_table(fs_info); btrfs_close_devices(fs_info->fs_devices); return err; @@ -2795,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) super = (struct btrfs_super_block *)bh->b_data; if (btrfs_super_bytenr(super) != bytenr || - strncmp((char *)(&super->magic), BTRFS_MAGIC, - sizeof(super->magic))) { + super->magic != cpu_to_le64(BTRFS_MAGIC)) { brelse(bh); continue; } @@ -3076,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))) num_tolerated_disk_barrier_failures = 0; - else if (num_tolerated_disk_barrier_failures > 1 - && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) - num_tolerated_disk_barrier_failures = 1; + else if (num_tolerated_disk_barrier_failures > 1) { + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID10)) { + num_tolerated_disk_barrier_failures = 1; + } else if (flags & + BTRFS_BLOCK_GROUP_RAID6) { + num_tolerated_disk_barrier_failures = 2; + } + } } } up_read(&sinfo->groups_sem); @@ -3195,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + btrfs_free_log(NULL, root); + btrfs_free_log_root_tree(NULL, fs_info); + } + __btrfs_remove_free_space_cache(root->free_ino_pinned); __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); @@ -3339,7 +3402,7 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) btrfs_error_commit_super(root); btrfs_put_block_group_cache(fs_info); @@ -3352,9 +3415,9 @@ int close_ctree(struct btrfs_root *root) btrfs_free_qgroup_config(root->fs_info); - if (fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - (unsigned long long)fs_info->delalloc_bytes); + if (percpu_counter_sum(&fs_info->delalloc_bytes)) { + printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", + percpu_counter_sum(&fs_info->delalloc_bytes)); } free_extent_buffer(fs_info->extent_root->node); @@ -3384,6 +3447,8 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->workers); btrfs_stop_workers(&fs_info->endio_workers); btrfs_stop_workers(&fs_info->endio_meta_workers); +
btrfs_stop_workers(&fs_info->endio_raid56_workers); + btrfs_stop_workers(&fs_info->rmw_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->endio_freespace_worker); @@ -3401,9 +3466,13 @@ int close_ctree(struct btrfs_root *root) btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); + percpu_counter_destroy(&fs_info->delalloc_bytes); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); + btrfs_free_stripe_hash_table(fs_info); + return 0; } @@ -3443,11 +3512,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) (unsigned long long)transid, (unsigned long long)root->fs_info->generation); was_dirty = set_extent_buffer_dirty(buf); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += buf->len; - spin_unlock(&root->fs_info->delalloc_lock); - } + if (!was_dirty) + __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, + buf->len, + root->fs_info->dirty_metadata_batch); } static void __btrfs_btree_balance_dirty(struct btrfs_root *root, @@ -3457,8 +3525,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, * looks as though older kernels can get into trouble with * this code, they end up stuck in balance_dirty_pages forever */ - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; + int ret; if (current->flags & PF_MEMALLOC) return; @@ -3466,9 +3533,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, if (flush_delayed) btrfs_balance_delayed_items(root); - num_dirty = root->fs_info->dirty_metadata_bytes; - - if (num_dirty > thresh) { + ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, + BTRFS_DIRTY_METADATA_THRESH); + if (ret > 0) { balance_dirty_pages_ratelimited( root->fs_info->btree_inode->i_mapping); } @@ -3518,7 +3585,8 @@ void btrfs_error_commit_super(struct btrfs_root *root) btrfs_cleanup_transaction(root); } -static void btrfs_destroy_ordered_operations(struct btrfs_root *root) +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, + struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; struct list_head splice; @@ -3528,7 +3596,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); - list_splice_init(&root->fs_info->ordered_operations, &splice); + list_splice_init(&t->ordered_operations, &splice); while (!list_empty(&splice)) { btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); @@ -3544,35 +3612,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { - struct list_head splice; struct btrfs_ordered_extent *ordered; - struct inode *inode; - - INIT_LIST_HEAD(&splice); spin_lock(&root->fs_info->ordered_extent_lock); - - list_splice_init(&root->fs_info->ordered_extents, &splice); - while (!list_empty(&splice)) { - ordered = list_entry(splice.next, struct btrfs_ordered_extent, - root_extent_list); - - list_del_init(&ordered->root_extent_list); - atomic_inc(&ordered->refs); - - /* the inode may be getting freed (in sys_unlink path). 
*/ - inode = igrab(ordered->inode); - - spin_unlock(&root->fs_info->ordered_extent_lock); - if (inode) - iput(inode); - - atomic_set(&ordered->refs, 1); - btrfs_put_ordered_extent(ordered); - - spin_lock(&root->fs_info->ordered_extent_lock); - } - + /* + * This will just short circuit the ordered completion stuff which will + * make sure the ordered extent gets properly cleaned up. + */ + list_for_each_entry(ordered, &root->fs_info->ordered_extents, + root_extent_list) + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); spin_unlock(&root->fs_info->ordered_extent_lock); } @@ -3594,11 +3643,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, } while ((node = rb_first(&delayed_refs->root)) != NULL) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + struct btrfs_delayed_ref_head *head = NULL; + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); atomic_set(&ref->refs, 1); if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; head = btrfs_delayed_node_to_head(ref); if (!mutex_trylock(&head->mutex)) { @@ -3614,16 +3663,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } - kfree(head->extent_op); + btrfs_free_delayed_extent_op(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) delayed_refs->num_heads_ready--; list_del_init(&head->cluster); } + ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - + if (head) + mutex_unlock(&head->mutex); spin_unlock(&delayed_refs->lock); btrfs_put_delayed_ref(ref); @@ -3671,6 +3722,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) delalloc_inodes); list_del_init(&btrfs_inode->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &btrfs_inode->runtime_flags); btrfs_invalidate_inodes(btrfs_inode->root); } @@ -3823,10 +3876,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) while (!list_empty(&list)) { t = list_entry(list.next, struct btrfs_transaction, list); - if (!t) - break; - btrfs_destroy_ordered_operations(root); + btrfs_destroy_ordered_operations(t, root); btrfs_destroy_ordered_extents(root); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 305c33efb0e3..034d7dc552b2 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,6 +25,13 @@ #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 +enum { + BTRFS_WQ_ENDIO_DATA = 0, + BTRFS_WQ_ENDIO_METADATA = 1, + BTRFS_WQ_ENDIO_FREE_SPACE = 2, + BTRFS_WQ_ENDIO_RAID56 = 3, +}; + static inline u64 btrfs_sb_offset(int mirror) { u64 start = 16 * 1024; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cf54bdfee334..3e074dab2d57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -31,6 +31,7 @@ #include "print-tree.h" #include "transaction.h" #include "volumes.h" +#include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "math.h" @@ -72,8 +73,7 @@ enum { RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int reserve); +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 
+ u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, rb_link_node(&block_group->cache_node, parent, p); rb_insert_color(&block_group->cache_node, &info->block_group_cache_tree); + + if (info->first_logical_byte > block_group->key.objectid) + info->first_logical_byte = block_group->key.objectid; + spin_unlock(&info->block_group_cache_lock); return 0; @@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, break; } } - if (ret) + if (ret) { btrfs_get_block_group(ret); + if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) + info->first_logical_byte = ret->key.objectid; + } spin_unlock(&info->block_group_cache_lock); return ret; @@ -468,8 +477,6 @@ static noinline void caching_thread(struct btrfs_work *work) } static int cache_block_group(struct btrfs_block_group_cache *cache, - struct btrfs_trans_handle *trans, - struct btrfs_root *root, int load_cache_only) { DEFINE_WAIT(wait); @@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_FAST; spin_unlock(&cache->lock); - /* - * We can't do the read from on-disk cache during a commit since we need - * to have the normal tree locking. Also if we are currently trying to - * allocate blocks for the tree root we can't do the fast caching since - * we likely hold important locks. - */ if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { ret = load_free_space_cache(fs_info, cache); @@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, *actual_bytes = discarded_bytes; + if (ret == -EOPNOTSUPP) + ret = 0; return ret; } @@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->num_bytes); } } - mutex_unlock(&head->mutex); return ret; } @@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * process of being added. Don't run this ref yet. 
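The btrfs_discard_extent() hunk above downgrades -EOPNOTSUPP to success, since discard is a best-effort optimization and a device without TRIM support is not an error the caller can act on. A minimal standalone sketch of that pattern; issue_discard() is a hypothetical stand-in, only the errno handling mirrors the hunk:

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in for sending a discard request to a device. */
static int issue_discard(unsigned long long start, unsigned long long len)
{
	(void)start;
	(void)len;
	return -EOPNOTSUPP;	/* pretend the device lacks discard support */
}

/* Treat "not supported" as success: discarding is purely optional. */
static int discard_extent(unsigned long long start, unsigned long long len)
{
	int ret = issue_discard(start, len);

	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

int main(void)
{
	printf("discard_extent() -> %d\n", discard_extent(0, 4096));
	return 0;
}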
*/ list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); + btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); @@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref = &locked_ref->node; if (extent_op && must_insert_reserved) { - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); extent_op = NULL; } @@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_delayed_extent_op(trans, root, ref, extent_op); - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); if (ret) { - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - - printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); + printk(KERN_DEBUG + "btrfs: run_delayed_extent_op " + "returned %d\n", ret); spin_lock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(locked_ref); return ret; } goto next; } - - list_del_init(&locked_ref->cluster); - locked_ref = NULL; } ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; - if (locked_ref) { + if (!btrfs_delayed_ref_is_head(ref)) { /* * when we play the delayed ref, also correct the * ref_mod on head @@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); - btrfs_put_delayed_ref(ref); - kfree(extent_op); - count++; - + btrfs_free_delayed_extent_op(extent_op); if (ret) { - if (locked_ref) { - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - } - printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); + btrfs_delayed_ref_unlock(locked_ref); + btrfs_put_delayed_ref(ref); + printk(KERN_DEBUG + "btrfs: run_one_delayed_ref returned %d\n", ret); spin_lock(&delayed_refs->lock); return ret; } + /* + * If this node is a head, that means all the refs in this head + * have been dealt with, and we will pick the next head to deal + * with, so we must unlock the head and drop it from the cluster + * list before we release it. + */ + if (btrfs_delayed_ref_is_head(ref)) { + list_del_init(&locked_ref->cluster); + btrfs_delayed_ref_unlock(locked_ref); + locked_ref = NULL; + } + btrfs_put_delayed_ref(ref); + count++; next: cond_resched(); spin_lock(&delayed_refs->lock); @@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, return ret; } +static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, + int count) +{ + int val = atomic_read(&delayed_refs->ref_seq); + + if (val < seq || val >= seq + count) + return 1; + return 0; +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. 
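The refs_newer() helper added above reports whether delayed_refs->ref_seq has moved outside the window [seq, seq + count), i.e. whether other tasks have already pushed enough delayed refs through since the snapshot was taken. A standalone sketch of the same window test with a plain integer standing in for the atomic counter; the names are illustrative, not btrfs's:

#include <stdio.h>

/*
 * Return 1 once the live sequence value has left the window
 * [snapshot, snapshot + count), meaning enough progress happened
 * elsewhere and the caller can skip doing the work itself.
 */
static int seq_progressed(int live_seq, int snapshot, int count)
{
	if (live_seq < snapshot || live_seq >= snapshot + count)
		return 1;
	return 0;
}

int main(void)
{
	printf("%d\n", seq_progressed(100, 100, 256));	/* still inside: 0 */
	printf("%d\n", seq_progressed(360, 100, 256));	/* moved past: 1 */
	return 0;
}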
count can be @@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; INIT_LIST_HEAD(&cluster); + if (count == 0) { + count = delayed_refs->num_entries * 2; + run_most = 1; + } + + if (!run_all && !run_most) { + int old; + int seq = atomic_read(&delayed_refs->ref_seq); + +progress: + old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); + if (old) { + DEFINE_WAIT(__wait); + if (delayed_refs->num_entries < 16348) + return 0; + + prepare_to_wait(&delayed_refs->wait, &__wait, + TASK_UNINTERRUPTIBLE); + + old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); + if (old) { + schedule(); + finish_wait(&delayed_refs->wait, &__wait); + + if (!refs_newer(delayed_refs, seq, 256)) + goto progress; + else + return 0; + } else { + finish_wait(&delayed_refs->wait, &__wait); + goto again; + } + } + + } else { + atomic_inc(&delayed_refs->procs_running_refs); + } + again: loops = 0; spin_lock(&delayed_refs->lock); @@ -2477,10 +2533,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - if (count == 0) { - count = delayed_refs->num_entries * 2; - run_most = 1; - } while (1) { if (!(run_all || run_most) && delayed_refs->num_heads_ready < 64) @@ -2500,11 +2552,15 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, ret = run_clustered_refs(trans, root, &cluster); if (ret < 0) { + btrfs_release_ref_cluster(&cluster); spin_unlock(&delayed_refs->lock); btrfs_abort_transaction(trans, root, ret); + atomic_dec(&delayed_refs->procs_running_refs); return ret; } + atomic_add(ret, &delayed_refs->ref_seq); + count -= min_t(unsigned long, ret, count); if (count == 0) @@ -2573,6 +2629,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, goto again; } out: + atomic_dec(&delayed_refs->procs_running_refs); + smp_mb(); + if (waitqueue_active(&delayed_refs->wait)) + wake_up(&delayed_refs->wait); + spin_unlock(&delayed_refs->lock); assert_qgroups_uptodate(trans); return 0; @@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op; int ret; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + extent_op = btrfs_alloc_delayed_extent_op(); if (!extent_op) return -ENOMEM; @@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, num_bytes, extent_op); if (ret) - kfree(extent_op); + btrfs_free_delayed_extent_op(extent_op); return ret; } @@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 extra_flags = chunk_to_extended(flags) & BTRFS_EXTENDED_PROFILE_MASK; + write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits |= extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) fs_info->avail_system_alloc_bits |= extra_flags; + write_sequnlock(&fs_info->profiles_lock); } /* @@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) u64 num_devices = root->fs_info->fs_devices->rw_devices + root->fs_info->fs_devices->missing_devices; u64 target; + u64 tmp; /* * see if restripe for this chunk_type is in progress, if so @@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) } spin_unlock(&root->fs_info->balance_lock); + /* 
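The set_avail_alloc_bits() hunk above begins publishing the avail_*_alloc_bits words under a new seqlock, fs_info->profiles_lock; the matching retry loop for readers shows up just below in get_alloc_profile(). A kernel-style sketch of that writer/reader pairing, assuming kernel context (<linux/seqlock.h>); the profile_state structure and its field names are placeholders, not btrfs types:

#include <linux/seqlock.h>
#include <linux/types.h>

struct profile_state {
	seqlock_t lock;		/* plays the role of profiles_lock */
	u64 avail_bits;		/* plays the role of avail_*_alloc_bits */
};

static void profile_init(struct profile_state *ps)
{
	seqlock_init(&ps->lock);
	ps->avail_bits = 0;
}

/* Writer side: updates are rare, so take the write lock. */
static void profile_add_bits(struct profile_state *ps, u64 extra)
{
	write_seqlock(&ps->lock);
	ps->avail_bits |= extra;
	write_sequnlock(&ps->lock);
}

/* Reader side: lock-free, retry if a writer raced with us. */
static u64 profile_read_bits(struct profile_state *ps)
{
	unsigned int seq;
	u64 bits;

	do {
		seq = read_seqbegin(&ps->lock);
		bits = ps->avail_bits;
	} while (read_seqretry(&ps->lock, seq));

	return bits;
}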
First, mask out the RAID levels which aren't possible */ if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5); + if (num_devices < 3) + flags &= ~BTRFS_BLOCK_GROUP_RAID6; if (num_devices < 4) flags &= ~BTRFS_BLOCK_GROUP_RAID10; - if ((flags & BTRFS_BLOCK_GROUP_DUP) && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) { - flags &= ~BTRFS_BLOCK_GROUP_DUP; - } + tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); + flags &= ~tmp; - if ((flags & BTRFS_BLOCK_GROUP_RAID1) && - (flags & BTRFS_BLOCK_GROUP_RAID10)) { - flags &= ~BTRFS_BLOCK_GROUP_RAID1; - } + if (tmp & BTRFS_BLOCK_GROUP_RAID6) + tmp = BTRFS_BLOCK_GROUP_RAID6; + else if (tmp & BTRFS_BLOCK_GROUP_RAID5) + tmp = BTRFS_BLOCK_GROUP_RAID5; + else if (tmp & BTRFS_BLOCK_GROUP_RAID10) + tmp = BTRFS_BLOCK_GROUP_RAID10; + else if (tmp & BTRFS_BLOCK_GROUP_RAID1) + tmp = BTRFS_BLOCK_GROUP_RAID1; + else if (tmp & BTRFS_BLOCK_GROUP_RAID0) + tmp = BTRFS_BLOCK_GROUP_RAID0; - if ((flags & BTRFS_BLOCK_GROUP_RAID0) && - ((flags & BTRFS_BLOCK_GROUP_RAID1) | - (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) { - flags &= ~BTRFS_BLOCK_GROUP_RAID0; - } - - return extended_to_chunk(flags); + return extended_to_chunk(flags | tmp); } static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { - if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits; + unsigned seq; + + do { + seq = read_seqbegin(&root->fs_info->profiles_lock); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); return btrfs_reduce_alloc_profile(root, flags); } @@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { u64 flags; + u64 ret; if (data) flags = BTRFS_BLOCK_GROUP_DATA; @@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) else flags = BTRFS_BLOCK_GROUP_METADATA; - return get_alloc_profile(root, flags); + ret = get_alloc_profile(root, flags); + return ret; } /* @@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) int ret = 0, committed = 0, alloc_chunk = 1; /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes = ALIGN(bytes, root->sectorsize); if (root == root->fs_info->tree_root || BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { @@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) struct btrfs_space_info *data_sinfo; /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes = ALIGN(bytes, root->sectorsize); data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); @@ -3516,8 +3590,10 @@ static u64 
get_system_chunk_thresh(struct btrfs_root *root, u64 type) { u64 num_dev; - if (type & BTRFS_BLOCK_GROUP_RAID10 || - type & BTRFS_BLOCK_GROUP_RAID0) + if (type & (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) num_dev = root->fs_info->fs_devices->rw_devices; else if (type & BTRFS_BLOCK_GROUP_RAID1) num_dev = 2; @@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, int wait_for_alloc = 0; int ret = 0; + /* Don't re-enter if we're already allocating a chunk */ + if (trans->allocating_chunk) + return -ENOSPC; + space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, @@ -3606,6 +3686,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, goto again; } + trans->allocating_chunk = true; + /* * If we have mixed data/metadata chunks we want to make sure we keep * allocating mixed chunks instead of individual chunks. @@ -3632,19 +3714,20 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, check_system_chunk(trans, extent_root, flags); ret = btrfs_alloc_chunk(trans, extent_root, flags); - if (ret < 0 && ret != -ENOSPC) - goto out; + trans->allocating_chunk = false; spin_lock(&space_info->lock); + if (ret < 0 && ret != -ENOSPC) + goto out; if (ret) space_info->full = 1; else ret = 1; space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); -out: mutex_unlock(&fs_info->chunk_mutex); return ret; } @@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 profile = btrfs_get_alloc_profile(root, 0); + u64 rsv_size = 0; u64 avail; u64 used; + u64 to_add; used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + space_info->bytes_pinned + space_info->bytes_readonly; + + spin_lock(&global_rsv->lock); + rsv_size = global_rsv->size; + spin_unlock(&global_rsv->lock); + + /* + * We only want to allow over committing if we have lots of actual space + * free, but if we don't have enough space to handle the global reserve + * space then we could end up having a real enospc problem when trying + * to allocate a chunk or some other such important allocation. + */ + rsv_size <<= 1; + if (used + rsv_size >= space_info->total_bytes) + return 0; + + used += space_info->bytes_may_use; spin_lock(&root->fs_info->free_chunk_lock); avail = root->fs_info->free_chunk_space; @@ -3667,28 +3768,60 @@ static int can_overcommit(struct btrfs_root *root, /* * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. + * space is actually useable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math */ if (profile & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) avail >>= 1; + to_add = space_info->total_bytes; + /* * If we aren't flushing all things, let us overcommit up to * 1/2th of the space. If we can flush, don't let us overcommit * too much, let it overcommit up to 1/8 of the space. 
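The can_overcommit() rework in the surrounding hunks does two things: it refuses any reservation once hard-used space plus twice the global reserve already fills the space_info, and it caps the overcommit allowance at min(free chunk space, 1/8 or 1/2 of the total) depending on how hard the caller is willing to flush. A standalone sketch of that arithmetic with plain integers; it leaves out the RAID-profile halving of the free chunk space that the real code also applies:

#include <stdint.h>
#include <stdio.h>

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

/*
 * used_hard: bytes_used + bytes_reserved + bytes_pinned + bytes_readonly
 * may_use:   bytes_may_use (soft reservations)
 * Returns nonzero if a new reservation of 'bytes' may overcommit.
 */
static int can_overcommit(uint64_t used_hard, uint64_t may_use,
			  uint64_t global_rsv, uint64_t total,
			  uint64_t free_chunk_space, uint64_t bytes,
			  int flush_all)
{
	uint64_t to_add;

	/* keep enough headroom for the global reserve, doubled */
	if (used_hard + (global_rsv << 1) >= total)
		return 0;

	/* overcommit 1/8 of the total if flushing hard, else 1/2 */
	to_add = flush_all ? total >> 3 : total >> 1;

	/* never promise more than could still be turned into chunks */
	to_add = min_u64(free_chunk_space, to_add);

	return used_hard + may_use + bytes < total + to_add;
}

int main(void)
{
	/* 1 GiB space_info, 64 MiB global reserve, 512 MiB unallocated */
	printf("%d\n", can_overcommit(700ULL << 20, 200ULL << 20,
				      64ULL << 20, 1024ULL << 20,
				      512ULL << 20, 200ULL << 20, 1));
	return 0;
}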
*/ if (flush == BTRFS_RESERVE_FLUSH_ALL) - avail >>= 3; + to_add >>= 3; else - avail >>= 1; + to_add >>= 1; - if (used + bytes < space_info->total_bytes + avail) + /* + * Limit the overcommit to the amount of free space we could possibly + * allocate for chunks. + */ + to_add = min(avail, to_add); + + if (used + bytes < space_info->total_bytes + to_add) return 1; return 0; } +void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, + unsigned long nr_pages) +{ + struct super_block *sb = root->fs_info->sb; + int started; + + /* If we can not start writeback, just sync all the delalloc file. */ + started = try_to_writeback_inodes_sb_nr(sb, nr_pages, + WB_REASON_FS_FREE_SPACE); + if (!started) { + /* + * We needn't worry the filesystem going from r/w to r/o though + * we don't acquire ->s_umount mutex, because the filesystem + * should guarantee the delalloc inodes list be empty after + * the filesystem is readonly(all dirty pages are written to + * the disk). + */ + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0); + } +} + /* * shrink metadata reservation for delalloc */ @@ -3710,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, space_info = block_rsv->space_info; smp_mb(); - delalloc_bytes = root->fs_info->delalloc_bytes; + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); if (delalloc_bytes == 0) { if (trans) return; @@ -3721,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - try_to_writeback_inodes_sb_nr(root->fs_info->sb, - nr_pages, - WB_REASON_FS_FREE_SPACE); - + btrfs_writeback_inodes_sb_nr(root, nr_pages); /* * We need to wait for the async pages to actually start before * we do anything. @@ -3752,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, break; } smp_mb(); - delalloc_bytes = root->fs_info->delalloc_bytes; + delalloc_bytes = percpu_counter_sum_positive( + &root->fs_info->delalloc_bytes); } } @@ -4016,6 +4148,15 @@ static int reserve_metadata_bytes(struct btrfs_root *root, goto again; out: + if (ret == -ENOSPC && + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { + struct btrfs_block_rsv *global_rsv = + &root->fs_info->global_block_rsv; + + if (block_rsv != global_rsv && + !block_rsv_use_bytes(global_rsv, orig_bytes)) + ret = 0; + } if (flushing) { spin_lock(&space_info->lock); space_info->flush = 0; @@ -4402,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode) btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * qgroup_reserved: used to return the reserved size in qgroup + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reseravtion mechanism in start_transaction(). 
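shrink_delalloc() in the hunks above stops reading a plain fs_info->delalloc_bytes and sums a per-CPU counter instead (percpu_counter_sum_positive()); a later extent_io.c hunk applies the same treatment to dirty_metadata_bytes with a batched __percpu_counter_add(). A kernel-style sketch of that split between cheap updates and exact reads, assuming kernel context (<linux/percpu_counter.h>) and a counter initialized elsewhere; the function names here are illustrative only:

#include <linux/percpu_counter.h>

/*
 * Hot path: mostly touches a per-CPU slot and only folds into the
 * shared count every 'batch' bytes, keeping cacheline traffic low.
 */
static void delalloc_account(struct percpu_counter *ctr, s64 delta, s32 batch)
{
	__percpu_counter_add(ctr, delta, batch);
}

/*
 * Slow path: walks every CPU for an exact total, clamped at zero,
 * which is what callers like shrink_delalloc() actually need.
 */
static s64 delalloc_total(struct percpu_counter *ctr)
{
	return percpu_counter_sum_positive(ctr);
}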
+ */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int items, + u64 *qgroup_reserved) { - struct btrfs_root *root = pending->root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); - struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; - /* - * two for root back/forward refs, two for directory entries, - * one for root of the snapshot and one for parent inode. - */ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); - dst_rsv->space_info = src_rsv->space_info; - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + u64 num_bytes; + int ret; + + if (root->fs_info->quota_enabled) { + /* One for parent inode, two for dir entries */ + num_bytes = 3 * root->leafsize; + ret = btrfs_qgroup_reserve(root, num_bytes); + if (ret) + return ret; + } else { + num_bytes = 0; + } + + *qgroup_reserved = num_bytes; + + num_bytes = btrfs_calc_trans_metadata_size(root, items); + rsv->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) { + if (*qgroup_reserved) + btrfs_qgroup_free(root, *qgroup_reserved); + } + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + u64 qgroup_reserved) +{ + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); } /** @@ -4522,6 +4704,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; + u64 to_free = 0; + unsigned dropped; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc @@ -4565,54 +4749,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - if (root->fs_info->quota_enabled) + if (root->fs_info->quota_enabled) { ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); + if (ret) + goto out_fail; + } - /* - * ret != 0 here means the qgroup reservation failed, we go straight to - * the shared error handling then. - */ - if (ret == 0) - ret = reserve_metadata_bytes(root, block_rsv, - to_reserve, flush); - - if (ret) { - u64 to_free = 0; - unsigned dropped; - - spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); - /* - * If the inodes csum_bytes is the same as the original - * csum_bytes then we know we haven't raced with any free()ers - * so we can just reduce our inodes csum bytes and carry on. - * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. 
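btrfs_subvolume_reserve_metadata() above stacks two reservations: a qgroup reservation sized in leaves (3 * leafsize for the parent inode and two directory entries) and a block reservation sized by btrfs_calc_trans_metadata_size(root, items), and it releases the qgroup part again if the block reservation fails. A standalone sketch of that reserve-then-roll-back shape; the reserve/release helpers are hypothetical stand-ins, not btrfs functions:

#include <errno.h>
#include <stdint.h>

/* Hypothetical stand-ins for the two reservation pools. */
static int qgroup_reserve(uint64_t bytes) { (void)bytes; return 0; }
static void qgroup_free(uint64_t bytes) { (void)bytes; }
static int block_rsv_add(uint64_t bytes) { (void)bytes; return -ENOSPC; }

/*
 * Reserve from the quota pool first, then from the metadata block
 * reserve; if the second step fails, undo the first so nothing leaks.
 * On success *qgroup_reserved tells the caller how much to give back
 * later.
 */
static int subvolume_reserve(uint64_t qgroup_bytes, uint64_t rsv_bytes,
			     uint64_t *qgroup_reserved)
{
	int ret;

	ret = qgroup_reserve(qgroup_bytes);
	if (ret)
		return ret;
	*qgroup_reserved = qgroup_bytes;

	ret = block_rsv_add(rsv_bytes);
	if (ret && *qgroup_reserved)
		qgroup_free(*qgroup_reserved);
	return ret;
}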
- */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) - calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (dropped) - to_free += btrfs_calc_trans_metadata_size(root, dropped); - - if (to_free) { - btrfs_block_rsv_release(root, block_rsv, to_free); - trace_btrfs_space_reservation(root->fs_info, - "delalloc", - btrfs_ino(inode), - to_free, 0); - } - if (root->fs_info->quota_enabled) { + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + if (unlikely(ret)) { + if (root->fs_info->quota_enabled) btrfs_qgroup_free(root, num_bytes + nr_extents * root->leafsize); - } - if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - return ret; + goto out_fail; } spin_lock(&BTRFS_I(inode)->lock); @@ -4633,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; + +out_fail: + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode); + /* + * If the inodes csum_bytes is the same as the original + * csum_bytes then we know we haven't raced with any free()ers + * so we can just reduce our inodes csum bytes and carry on. + * Otherwise we have to do the normal free thing to account for + * the case that the free side didn't free up its reserve + * because of this outstanding reservation. + */ + if (BTRFS_I(inode)->csum_bytes == csum_bytes) + calc_csum_metadata_size(inode, num_bytes, 0); + else + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped) + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + if (to_free) { + btrfs_block_rsv_release(root, block_rsv, to_free); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), to_free, 0); + } + if (delalloc_lock) + mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); + return ret; } /** @@ -4654,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + if (num_bytes) + to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -4721,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) btrfs_free_reserved_data_space(inode, num_bytes); } -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int update_block_group(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; @@ -4759,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, * space back to the block group, otherwise we will leak space. 
*/ if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, trans, NULL, 1); + cache_block_group(cache, 1); byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -4809,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) struct btrfs_block_group_cache *cache; u64 bytenr; + spin_lock(&root->fs_info->block_group_cache_lock); + bytenr = root->fs_info->first_logical_byte; + spin_unlock(&root->fs_info->block_group_cache_lock); + + if (bytenr < (u64)-1) + return bytenr; + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); if (!cache) return 0; @@ -4859,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root, /* * this function must be called within transaction */ -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { struct btrfs_block_group_cache *cache; @@ -4874,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, * to one because the slow code to read in the free extents does check * the pinned extents. */ - cache_block_group(cache, trans, root, 1); + cache_block_group(cache, 1); pin_down_extent(root, cache, bytenr, num_bytes, 0); @@ -5271,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = update_block_group(trans, root, bytenr, num_bytes, 0); + ret = update_block_group(root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5316,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->extent_op) { if (!head->must_insert_reserved) goto out; - kfree(head->extent_op); + btrfs_free_delayed_extent_op(head->extent_op); head->extent_op = NULL; } @@ -5439,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ret; } -static u64 stripe_align(struct btrfs_root *root, u64 val) +static u64 stripe_align(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 val, u64 num_bytes) { - u64 mask = ((u64)root->stripesize - 1); - u64 ret = (val + mask) & ~mask; + u64 ret = ALIGN(val, root->stripesize); return ret; } @@ -5462,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); caching_ctl = get_caching_control(cache); if (!caching_ctl) @@ -5479,7 +5662,6 @@ static noinline int wait_block_group_cache_done(struct btrfs_block_group_cache *cache) { struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); caching_ctl = get_caching_control(cache); if (!caching_ctl) @@ -5493,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) int __get_raid_index(u64 flags) { - int index; - if (flags & BTRFS_BLOCK_GROUP_RAID10) - index = 0; + return BTRFS_RAID_RAID10; else if (flags & BTRFS_BLOCK_GROUP_RAID1) - index = 1; + return BTRFS_RAID_RAID1; else if (flags & BTRFS_BLOCK_GROUP_DUP) - index = 2; + return BTRFS_RAID_DUP; else if (flags & BTRFS_BLOCK_GROUP_RAID0) - index = 3; - else - index = 4; + return BTRFS_RAID_RAID0; + else if (flags & BTRFS_BLOCK_GROUP_RAID5) + return BTRFS_RAID_RAID5; + else if (flags & BTRFS_BLOCK_GROUP_RAID6) + return BTRFS_RAID_RAID6; - return index; + return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } static int get_block_group_index(struct btrfs_block_group_cache *cache) @@ -5649,6 +5831,8 @@ 
static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (!block_group_bits(block_group, data)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10; /* @@ -5664,8 +5848,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, cached = block_group_cache_done(block_group); if (unlikely(!cached)) { found_uncached_bg = true; - ret = cache_block_group(block_group, trans, - orig_root, 0); + ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; } @@ -5678,6 +5861,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, * lets look there */ if (last_ptr) { + unsigned long aligned_cluster; /* * the refill lock keeps out other * people trying to start a new cluster @@ -5744,11 +5928,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, goto unclustered_alloc; } + aligned_cluster = max_t(unsigned long, + empty_cluster + empty_size, + block_group->full_stripe_len); + /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, search_start, num_bytes, - empty_cluster + empty_size); + aligned_cluster); if (ret == 0) { /* * now pull our allocation out of this @@ -5819,7 +6007,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, goto loop; } checks: - search_start = stripe_align(root, offset); + search_start = stripe_align(root, used_block_group, + offset, num_bytes); /* move on to the next group */ if (search_start + num_bytes > @@ -5970,7 +6159,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, if (ret == -ENOSPC) { if (!final_tried) { num_bytes = num_bytes >> 1; - num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); if (num_bytes == min_alloc_size) final_tried = true; @@ -6094,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@ -6158,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@ -6201,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, trans, NULL, 0); + cache_block_group(block_group, 0); caching_ctl = get_caching_control(block_group); if (!caching_ctl) { @@ -6315,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; if (ret && !block_rsv->failfast) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL, - /*DEFAULT_RATELIMIT_BURST*/ 2); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", - ret); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + static 
DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "btrfs: block rsv returned %d\n", ret); + } ret = reserve_metadata_bytes(root, block_rsv, blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) { @@ -6386,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + extent_op = btrfs_alloc_delayed_extent_op(); BUG_ON(!extent_op); /* -ENOMEM */ if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); @@ -7189,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) root->fs_info->fs_devices->missing_devices; stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; if (num_devices == 1) { @@ -7467,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) index = get_block_group_index(block_group); } - if (index == 0) { + if (index == BTRFS_RAID_RAID10) { dev_min = 4; /* Divide by 2 */ min_free >>= 1; - } else if (index == 1) { + } else if (index == BTRFS_RAID_RAID1) { dev_min = 2; - } else if (index == 2) { + } else if (index == BTRFS_RAID_DUP) { /* Multiply by 2 */ min_free <<= 1; - } else if (index == 3) { + } else if (index == BTRFS_RAID_RAID0) { dev_min = fs_devices->rw_devices; do_div(min_free, dev_min); } @@ -7637,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0) { - WARN_ON(1); - dump_space_info(space_info, 0, 0); + if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { + if (space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0) { + WARN_ON(1); + dump_space_info(space_info, 0, 0); + } } list_del(&space_info->list); kfree(space_info); @@ -7740,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) btrfs_release_path(path); cache->flags = btrfs_block_group_flags(&cache->item); cache->sectorsize = root->sectorsize; - + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + found_key.objectid); btrfs_init_free_space_ctl(cache); /* @@ -7794,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) if (!(get_alloc_profile(root, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_DUP))) continue; /* @@ -7869,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; cache->sectorsize = root->sectorsize; cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + chunk_offset); atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); @@ -7918,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 extra_flags = chunk_to_extended(flags) & BTRFS_EXTENDED_PROFILE_MASK; + write_seqlock(&fs_info->profiles_lock); if (flags & BTRFS_BLOCK_GROUP_DATA) fs_info->avail_data_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_METADATA) fs_info->avail_metadata_alloc_bits &= ~extra_flags; if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 
fs_info->avail_system_alloc_bits &= ~extra_flags; + write_sequnlock(&fs_info->profiles_lock); } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, @@ -8022,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + + if (root->fs_info->first_logical_byte == block_group->key.objectid) + root->fs_info->first_logical_byte = (u64)-1; spin_unlock(&root->fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); @@ -8144,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) if (end - start >= range->minlen) { if (!block_group_cache_done(cache)) { - ret = cache_block_group(cache, NULL, root, 0); + ret = cache_block_group(cache, 0); if (!ret) wait_block_group_cache_done(cache); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1b319df29eee..f173c5af6461 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -1834,7 +1833,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, */ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); @@ -1846,7 +1845,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) */ static void check_page_locked(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) unlock_page(page); @@ -1895,13 +1894,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, if (ret) err = ret; - if (did_repair) { - ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_DAMAGED, GFP_NOFS); - if (ret && !err) - err = ret; - } + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; kfree(rec); return err; @@ -1932,10 +1929,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int ret; BUG_ON(!mirror_num); + /* we can't repair anything in raid56 yet */ + if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) + return 0; + bio = bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; @@ -1960,7 +1962,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, return -EIO; } bio->bi_bdev = dev->bdev; - bio_add_page(bio, page, length, start-page_offset(page)); + bio_add_page(bio, page, length, start - page_offset(page)); btrfsic_submit_bio(WRITE_SYNC, bio); wait_for_completion(&compl); @@ -2052,6 +2054,7 @@ static int clean_io_failure(u64 start, struct page *page) failrec->failed_mirror); did_repair = !ret; } + ret = 0; } out: @@ -2293,8 +2296,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) struct page *page = bvec->bv_page; tree = &BTRFS_I(page->mapping->host)->io_tree; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; + start = page_offset(page) + bvec->bv_offset; 
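The extent_io.c hunks in this area replace the open-coded (u64)page->index << PAGE_CACHE_SHIFT with page_offset(page); with PAGE_CACHE_SHIFT equal to PAGE_SHIFT, both forms yield the byte offset of the page within its mapping. A tiny standalone sketch of that calculation, assuming 4 KiB pages; the stub type below is not the kernel's struct page:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages for the example */

struct page_stub {
	unsigned long index;	/* page's index within its mapping */
};

/* Same arithmetic page_offset() performs: index * page size. */
static uint64_t page_byte_offset(const struct page_stub *page)
{
	return (uint64_t)page->index << PAGE_SHIFT;
}

int main(void)
{
	struct page_stub p = { .index = 3 };

	/* page 3 of a mapping starts at byte 12288 with 4 KiB pages */
	printf("%llu\n", (unsigned long long)page_byte_offset(&p));
	return 0;
}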
end = start + bvec->bv_len - 1; if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) @@ -2353,8 +2355,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; + start = page_offset(page) + bvec->bv_offset; end = start + bvec->bv_len - 1; if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) @@ -2471,7 +2472,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, struct extent_io_tree *tree = bio->bi_private; u64 start; - start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; @@ -2489,13 +2490,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, return ret; } -static int merge_bio(struct extent_io_tree *tree, struct page *page, +static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { int ret = 0; if (tree->ops && tree->ops->merge_bio_hook) - ret = tree->ops->merge_bio_hook(page, offset, size, bio, + ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, bio_flags); BUG_ON(ret < 0); return ret; @@ -2530,7 +2531,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, sector; if (prev_bio_flags != bio_flags || !contig || - merge_bio(tree, page, offset, page_size, bio, bio_flags) || + merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); @@ -2595,7 +2596,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, unsigned long *bio_flags) { struct inode *inode = page->mapping->host; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; u64 cur = start; @@ -2648,6 +2649,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } } while (cur <= end) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + if (cur >= last_byte) { char *userpage; struct extent_state *cached = NULL; @@ -2682,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) { disk_io_size = em->block_len; sector = em->block_start >> 9; @@ -2735,26 +2738,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree, continue; } - ret = 0; - if (tree->ops && tree->ops->readpage_io_hook) { - ret = tree->ops->readpage_io_hook(page, cur, - cur + iosize - 1); - } - if (!ret) { - unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; - pnr -= page->index; - ret = submit_extent_page(READ, tree, page, + pnr -= page->index; + ret = submit_extent_page(READ, tree, page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag); - if (!ret) { - nr++; - *bio_flags = this_bio_flag; - } - } - if (ret) { + if (!ret) { + nr++; + *bio_flags = this_bio_flag; + } else { SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); } @@ -2806,7 +2800,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct inode *inode = page->mapping->host; struct extent_page_data 
*epd = data; struct extent_io_tree *tree = epd->tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 delalloc_start; u64 page_end = start + PAGE_CACHE_SIZE - 1; u64 end; @@ -2982,7 +2976,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + iosize = ALIGN(iosize, blocksize); sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; block_start = em->block_start; @@ -3124,12 +3118,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); spin_unlock(&eb->refs_lock); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - spin_lock(&fs_info->delalloc_lock); - if (fs_info->dirty_metadata_bytes >= eb->len) - fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&fs_info->delalloc_lock); + __percpu_counter_add(&fs_info->dirty_metadata_bytes, + -eb->len, + fs_info->dirty_metadata_batch); ret = 1; } else { spin_unlock(&eb->refs_lock); @@ -3446,15 +3437,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && - tree->ops->write_cache_pages_lock_hook) { - tree->ops->write_cache_pages_lock_hook(page, - data, flush_fn); - } else { - if (!trylock_page(page)) { - flush_fn(data); - lock_page(page); - } + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); } if (unlikely(page->mapping != mapping)) { @@ -3674,11 +3659,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset) { struct extent_state *cached_state = NULL; - u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; size_t blocksize = page->mapping->host->i_sb->s_blocksize; - start += (offset + blocksize - 1) & ~(blocksize - 1); + start += ALIGN(offset, blocksize); if (start > end) return 0; @@ -3700,7 +3685,7 @@ int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; int ret = 1; @@ -3739,7 +3724,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, gfp_t mask) { struct extent_map *em; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; if ((mask & __GFP_WAIT) && @@ -3797,7 +3782,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, len = last - offset; if (len == 0) break; - len = (len + sectorsize - 1) & ~(sectorsize - 1); + len = ALIGN(len, sectorsize); em = get_extent(inode, NULL, 0, offset, len, 0); if (IS_ERR_OR_NULL(em)) return em; @@ -3995,8 +3980,6 @@ static void __free_extent_buffer(struct extent_buffer *eb) list_del(&eb->leak_list); spin_unlock_irqrestore(&leak_lock, flags); #endif - if (eb->pages && eb->pages != eb->inline_pages) - kfree(eb->pages); kmem_cache_free(extent_buffer_cache, eb); } @@ -4037,19 +4020,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, atomic_set(&eb->refs, 1); atomic_set(&eb->io_pages, 0); - if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { - struct page **pages; - int num_pages = (len + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - pages = 
kzalloc(num_pages, mask); - if (!pages) { - __free_extent_buffer(eb); - return NULL; - } - eb->pages = pages; - } else { - eb->pages = eb->inline_pages; - } + /* + * Sanity checks, currently the maximum is 64k covered by 16x 4k pages + */ + BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE + > MAX_INLINE_EXTENT_BUFFER_SIZE); + BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); return eb; } @@ -4180,6 +4156,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) static void check_buffer_tree_ref(struct extent_buffer *eb) { + int refs; /* the ref bit is tricky. We have to make sure it is set * if we have the buffer dirty. Otherwise the * code to free a buffer can end up dropping a dirty @@ -4200,6 +4177,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * So bump the ref count first, then set the bit. If someone * beat us to it, drop the ref we added. */ + refs = atomic_read(&eb->refs); + if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) + return; + spin_lock(&eb->refs_lock); if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) atomic_inc(&eb->refs); @@ -4401,9 +4382,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) void free_extent_buffer(struct extent_buffer *eb) { + int refs; + int old; if (!eb) return; + while (1) { + refs = atomic_read(&eb->refs); + if (refs <= 3) + break; + old = atomic_cmpxchg(&eb->refs, refs, refs - 1); + if (old == refs) + return; + } + spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2eacfabd3263..6068a1985560 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -72,10 +72,9 @@ struct extent_io_ops { int (*writepage_start_hook)(struct page *page, u64 start, u64 end); int (*writepage_io_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; - int (*merge_bio_hook)(struct page *page, unsigned long offset, + int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); - int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror); @@ -90,8 +89,6 @@ struct extent_io_ops { struct extent_state *other); void (*split_extent_hook)(struct inode *inode, struct extent_state *orig, u64 split); - int (*write_cache_pages_lock_hook)(struct page *page, void *data, - void (*flush_fn)(void *)); }; struct extent_io_tree { @@ -161,8 +158,7 @@ struct extent_buffer { */ wait_queue_head_t read_lock_wq; wait_queue_head_t lock_wq; - struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; - struct page **pages; + struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; }; static inline void extent_set_compress_type(unsigned long *bio_flags, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index fdb7a8db3b57..2834ca5768ea 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include "ctree.h" diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 94aa53b38721..ec160202be3e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -684,6 +684,24 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, return ret; } +static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums, + struct btrfs_sector_sum *sector_sum, + u64 total_bytes, 
u64 sectorsize) +{ + u64 tmp = sectorsize; + u64 next_sector = sector_sum->bytenr; + struct btrfs_sector_sum *next = sector_sum + 1; + + while ((tmp + total_bytes) < sums->len) { + if (next_sector + sectorsize != next->bytenr) + break; + tmp += sectorsize; + next_sector = next->bytenr; + next++; + } + return tmp; +} + int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) @@ -789,20 +807,32 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, goto insert; } - if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / + if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / csum_size) { - u32 diff = (csum_offset + 1) * csum_size; + int extend_nr; + u64 tmp; + u32 diff; + u32 free_space; - /* - * is the item big enough already? we dropped our lock - * before and need to recheck - */ - if (diff < btrfs_item_size_nr(leaf, path->slots[0])) - goto csum; + if (btrfs_leaf_free_space(root, leaf) < + sizeof(struct btrfs_item) + csum_size * 2) + goto insert; + + free_space = btrfs_leaf_free_space(root, leaf) - + sizeof(struct btrfs_item) - csum_size; + tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, + root->sectorsize); + tmp >>= root->fs_info->sb->s_blocksize_bits; + WARN_ON(tmp < 1); + + extend_nr = max_t(int, 1, (int)tmp); + diff = (csum_offset + extend_nr) * csum_size; + diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size); diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); - if (diff != csum_size) - goto insert; + diff = min(free_space, diff); + diff /= csum_size; + diff *= csum_size; btrfs_extend_item(trans, root, path, diff); goto csum; @@ -812,19 +842,14 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, btrfs_release_path(path); csum_offset = 0; if (found_next) { - u64 tmp = total_bytes + root->sectorsize; - u64 next_sector = sector_sum->bytenr; - struct btrfs_sector_sum *next = sector_sum + 1; + u64 tmp; - while (tmp < sums->len) { - if (next_sector + root->sectorsize != next->bytenr) - break; - tmp += root->sectorsize; - next_sector = next->bytenr; - next++; - } - tmp = min(tmp, next_offset - file_key.offset); + tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, + root->sectorsize); tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = min(tmp, (next_offset - file_key.offset) >> + root->fs_info->sb->s_blocksize_bits); + tmp = max((u64)1, tmp); tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); ins_size = csum_size * tmp; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4b241fe9d2fe..af1d0605a5c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -30,11 +30,11 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "tree-log.h" #include "locking.h" @@ -374,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(&fs_info->defrag_running); while(1) { + /* Pause the auto defragger. 
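btrfs_sector_sum_left() above measures how many of the queued checksum entries describe physically contiguous sectors, and the new csum-item code uses that run length to decide how far to extend an existing item. A simplified standalone sketch of the same walk over a plain array of disk byte numbers (the real helper also bounds the walk by the ordered sum's total length):

#include <stdint.h>
#include <stdio.h>

/*
 * Count the bytes, starting at bytenr[pos], whose sectors sit back to
 * back on disk (each bytenr follows the previous one by sectorsize).
 */
static uint64_t contiguous_bytes(const uint64_t *bytenr, int nr, int pos,
				 uint64_t sectorsize)
{
	uint64_t run = sectorsize;
	int i;

	for (i = pos; i + 1 < nr; i++) {
		if (bytenr[i] + sectorsize != bytenr[i + 1])
			break;
		run += sectorsize;
	}
	return run;
}

int main(void)
{
	uint64_t b[] = { 4096, 8192, 12288, 65536 };

	/* first three sectors are contiguous: 3 * 4096 = 12288 bytes */
	printf("%llu\n",
	       (unsigned long long)contiguous_bytes(b, 4, 0, 4096));
	return 0;
}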
*/ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, + &fs_info->fs_state)) + break; + if (!__need_auto_defrag(fs_info->tree_root)) break; @@ -505,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, loff_t isize = i_size_read(inode); start_pos = pos & ~((u64)root->sectorsize - 1); - num_bytes = (write_bytes + pos - start_pos + - root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, @@ -1544,7 +1548,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, * although we have opened a file as writable, we have * to stop this write operation to ensure FS consistency. */ - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { mutex_unlock(&inode->i_mutex); err = -EROFS; goto out; @@ -1627,7 +1631,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp) */ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags)) { - btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We need to block on a committing transaction to keep us from + * throwing a ordered operation on to the list and causing + * something like sync to deadlock trying to flush out this + * inode. + */ + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); + btrfs_end_transaction(trans, root); if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(inode->i_mapping); } @@ -1654,16 +1671,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; struct btrfs_trans_handle *trans; + bool full_sync = 0; trace_btrfs_sync_file(file, datasync); /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by - * multi-task, and make the performance up. + * multi-task, and make the performance up. See + * btrfs_wait_ordered_range for an explanation of the ASYNC check. */ atomic_inc(&BTRFS_I(inode)->sync_writers); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); atomic_dec(&BTRFS_I(inode)->sync_writers); if (ret) return ret; @@ -1675,7 +1697,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * range being left. */ atomic_inc(&root->log_batch); - btrfs_wait_ordered_range(inode, start, end - start + 1); + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + if (full_sync) + btrfs_wait_ordered_range(inode, start, end - start + 1); atomic_inc(&root->log_batch); /* @@ -1742,13 +1767,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret != BTRFS_NO_LOG_SYNC) { if (ret > 0) { + /* + * If we didn't already wait for ordered extents we need + * to do that now. 
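Two hunks above switch superblock state checks from masked flag words to numbered bits on fs_info->fs_state, tested with test_bit() (BTRFS_FS_STATE_ERROR for refusing writes, BTRFS_FS_STATE_REMOUNTING for pausing the auto defragger). A kernel-style sketch of that unsigned-long flag word idiom, assuming kernel context (<linux/bitops.h>); the MY_STATE_* names and the state word below are placeholders, not the btrfs definitions:

#include <linux/bitops.h>
#include <linux/types.h>

enum {
	MY_STATE_ERROR = 0,		/* placeholder bit numbers */
	MY_STATE_REMOUNTING = 1,
};

static unsigned long my_fs_state;

static void mark_fs_error(void)
{
	set_bit(MY_STATE_ERROR, &my_fs_state);		/* atomic set */
}

static bool writes_allowed(void)
{
	/* atomic single-bit test, no extra locking required */
	return !test_bit(MY_STATE_ERROR, &my_fs_state);
}

static bool should_pause_defrag(void)
{
	return test_bit(MY_STATE_REMOUNTING, &my_fs_state);
}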
+ */ + if (!full_sync) + btrfs_wait_ordered_range(inode, start, + end - start + 1); ret = btrfs_commit_transaction(trans, root); } else { ret = btrfs_sync_log(trans, root); - if (ret == 0) + if (ret == 0) { ret = btrfs_end_transaction(trans, root); - else + } else { + if (!full_sync) + btrfs_wait_ordered_range(inode, start, + end - + start + 1); ret = btrfs_commit_transaction(trans, root); + } } } else { ret = btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0be7a8742a43..1f84fc09c1a8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + max_bitmaps = max(max_bitmaps, 1); + BUG_ON(ctl->total_bitmaps > max_bitmaps); /* @@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, } static struct btrfs_free_space * -find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) +find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, + unsigned long align) { struct btrfs_free_space *entry; struct rb_node *node; + u64 ctl_off; + u64 tmp; + u64 align_off; int ret; if (!ctl->free_space_offset.rb_node) @@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) if (entry->bytes < *bytes) continue; + /* make sure the space returned is big enough + * to match our requested alignment + */ + if (*bytes >= align) { + ctl_off = entry->offset - ctl->start; + tmp = ctl_off + align - 1;; + do_div(tmp, align); + tmp = tmp * align + ctl->start; + align_off = tmp - entry->offset; + } else { + align_off = 0; + tmp = entry->offset; + } + + if (entry->bytes < *bytes + align_off) + continue; + if (entry->bitmap) { - ret = search_bitmap(ctl, entry, offset, bytes); - if (!ret) + ret = search_bitmap(ctl, entry, &tmp, bytes); + if (!ret) { + *offset = tmp; return entry; + } continue; } - *offset = entry->offset; - *bytes = entry->bytes; + *offset = tmp; + *bytes = entry->bytes - align_off; return entry; } @@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, } /* - * some block groups are so tiny they can't be enveloped by a bitmap, so - * don't even bother to create a bitmap for this + * The original block groups from mkfs can be really small, like 8 + * megabytes, so don't bother with a bitmap for those entries. However + * some block groups can be smaller than what a bitmap would cover but + * are still large enough that they could overflow the 32k memory limit, + * so allow those block groups to still be allowed to have a bitmap + * entry. 
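find_free_space() above rounds a candidate offset up to the requested allocation alignment, measured from ctl->start, and uses do_div() because a full RAID stripe length need not be a power of two. A standalone sketch of that round-up with ordinary 64-bit division standing in for do_div():

#include <stdint.h>
#include <stdio.h>

/*
 * Round 'offset' up to the next multiple of 'align' counted from
 * 'base' (ctl->start in the free-space code). 'align' may be any
 * non-zero value, not just a power of two.
 */
static uint64_t align_up_from(uint64_t offset, uint64_t base, uint64_t align)
{
	uint64_t off = offset - base;

	off = (off + align - 1) / align;	/* do_div() in the kernel */
	return off * align + base;
}

int main(void)
{
	/* align 1,000,000 to a 192 KiB (3 x 64 KiB) full stripe */
	printf("%llu\n",
	       (unsigned long long)align_up_from(1000000, 0, 3 * 65536));
	return 0;
}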
*/ - if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) + if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) return false; return true; @@ -2095,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry = NULL; u64 bytes_search = bytes + empty_size; u64 ret = 0; + u64 align_gap = 0; + u64 align_gap_len = 0; spin_lock(&ctl->tree_lock); - entry = find_free_space(ctl, &offset, &bytes_search); + entry = find_free_space(ctl, &offset, &bytes_search, + block_group->full_stripe_len); if (!entry) goto out; @@ -2107,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, if (!entry->bytes) free_bitmap(ctl, entry); } else { + unlink_free_space(ctl, entry); - entry->offset += bytes; - entry->bytes -= bytes; + align_gap_len = offset - entry->offset; + align_gap = entry->offset; + + entry->offset = offset + bytes; + WARN_ON(entry->bytes < bytes + align_gap_len); + + entry->bytes -= bytes + align_gap_len; if (!entry->bytes) kmem_cache_free(btrfs_free_space_cachep, entry); else @@ -2119,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, out: spin_unlock(&ctl->tree_lock); + if (align_gap_len) + __btrfs_add_free_space(ctl, align_gap, align_gap_len); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 55c07b650378..c226daefd65d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -39,12 +39,13 @@ #include #include #include +#include +#include #include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "ordered-data.h" #include "xattr.h" @@ -54,6 +55,7 @@ #include "locking.h" #include "free-space-cache.h" #include "inode-map.h" +#include "backref.h" struct btrfs_iget_args { u64 ino; @@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, u64 isize = i_size_read(inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; - u64 aligned_end = (end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + u64 aligned_end = ALIGN(end, root->sectorsize); u64 data_len = inline_len; int ret; @@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; @@ -389,7 +391,7 @@ static noinline int compress_file_range(struct inode *inode, * a compressed extent to 128k. 
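Several hunks here and below swap the open-coded (x + size - 1) & ~(size - 1) round-up for the kernel's ALIGN() macro; for the power-of-two sector and block sizes involved the two spellings are identical, and ALIGN(end - start + 1, blocksize) matches the old (end - start + blocksize) & ~(blocksize - 1) because end is inclusive. A small userspace check of that equivalence, with ALIGN_UP standing in for the kernel macro and the sector size assumed:

	#include <assert.h>
	#include <stdint.h>

	#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

	int main(void)
	{
		uint64_t sectorsize = 4096;	/* assumed */

		assert(ALIGN_UP(0, sectorsize) == 0);
		assert(ALIGN_UP(1, sectorsize) == 4096);
		assert(ALIGN_UP(4096, sectorsize) == 4096);
		assert(ALIGN_UP(4097, sectorsize) == 8192);
		return 0;
	}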
*/ total_compressed = min(total_compressed, max_uncompressed); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); total_in = 0; ret = 0; @@ -488,15 +490,13 @@ static noinline int compress_file_range(struct inode *inode, * up to a block size boundary so the allocator does sane * things */ - total_compressed = (total_compressed + blocksize - 1) & - ~(blocksize - 1); + total_compressed = ALIGN(total_compressed, blocksize); /* * one last check to make sure the compression is really a * win, compare the page count read with the blocks on disk */ - total_in = (total_in + PAGE_CACHE_SIZE - 1) & - ~(PAGE_CACHE_SIZE - 1); + total_in = ALIGN(total_in, PAGE_CACHE_SIZE); if (total_compressed >= total_in) { will_compress = 0; } else { @@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode, if (list_empty(&async_cow->extents)) return 0; - +again: while (!list_empty(&async_cow->extents)) { async_extent = list_entry(async_cow->extents.next, struct async_extent, list); @@ -648,6 +648,8 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->ram_size - 1, btrfs_get_extent, WB_SYNC_ALL); + else if (ret) + unlock_page(async_cow->locked_page); kfree(async_extent); cond_resched(); continue; @@ -672,6 +674,7 @@ static noinline int submit_compressed_extents(struct inode *inode, if (ret) { int i; + for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); page_cache_release(async_extent->pages[i]); @@ -679,12 +682,10 @@ static noinline int submit_compressed_extents(struct inode *inode, kfree(async_extent->pages); async_extent->nr_pages = 0; async_extent->pages = NULL; - unlock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); + if (ret == -ENOSPC) goto retry; - goto out_free; /* JDM: Requeue? */ + goto out_free; } /* @@ -696,10 +697,13 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->ram_size - 1, 0); em = alloc_extent_map(); - BUG_ON(!em); /* -ENOMEM */ + if (!em) + goto out_free_reserve; em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; + em->mod_start = em->start; + em->mod_len = em->len; em->block_start = ins.objectid; em->block_len = ins.offset; @@ -726,6 +730,9 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->ram_size - 1, 0); } + if (ret) + goto out_free_reserve; + ret = btrfs_add_ordered_extent_compress(inode, async_extent->start, ins.objectid, @@ -733,7 +740,8 @@ static noinline int submit_compressed_extents(struct inode *inode, ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto out_free_reserve; /* * clear dirty, set writeback and unlock the pages. 
@@ -754,18 +762,30 @@ static noinline int submit_compressed_extents(struct inode *inode, ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages); - - BUG_ON(ret); /* -ENOMEM */ alloc_hint = ins.objectid + ins.offset; kfree(async_extent); + if (ret) + goto out; cond_resched(); } ret = 0; out: return ret; +out_free_reserve: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); out_free: + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); kfree(async_extent); - goto out; + goto again; } static u64 get_extent_allocation_hint(struct inode *inode, u64 start, @@ -834,7 +854,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, BUG_ON(btrfs_is_free_space_inode(inode)); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; @@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->orig_start = em->start; ram_size = ins.offset; em->len = ins.offset; + em->mod_start = em->start; + em->mod_len = em->len; em->block_start = ins.objectid; em->block_len = ins.offset; @@ -1338,6 +1360,8 @@ static noinline int run_delalloc_nocow(struct inode *inode, em->block_start = disk_bytenr; em->orig_block_len = disk_num_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->mod_start = em->start; + em->mod_len = em->len; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_FILLING, &em->flags); em->generation = -1; @@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } - spin_lock(&root->fs_info->delalloc_lock); + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, + root->fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; - root->fs_info->delalloc_bytes += len; - if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->fs_info->delalloc_inodes); + if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->fs_info->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + } + spin_unlock(&root->fs_info->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&BTRFS_I(inode)->lock); } } @@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode, && do_list) btrfs_free_reserved_data_space(inode, len); - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->delalloc_bytes -= len; + __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, + root->fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes -= len; - if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && - !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_del_init(&BTRFS_I(inode)->delalloc_inodes); + test_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags)) { + spin_lock(&root->fs_info->delalloc_lock); + if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + 
list_del_init(&BTRFS_I(inode)->delalloc_inodes); + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &BTRFS_I(inode)->runtime_flags); + } + spin_unlock(&root->fs_info->delalloc_lock); } - spin_unlock(&root->fs_info->delalloc_lock); + spin_unlock(&BTRFS_I(inode)->lock); } } @@ -1566,7 +1605,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { @@ -1581,7 +1620,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, length = bio->bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, READ, logical, + ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, NULL, 0); /* Will always return 0 with map_multi == NULL */ BUG_ON(ret < 0); @@ -1892,6 +1931,640 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, return ret; } +/* snapshot-aware defrag */ +struct sa_defrag_extent_backref { + struct rb_node node; + struct old_sa_defrag_extent *old; + u64 root_id; + u64 inum; + u64 file_pos; + u64 extent_offset; + u64 num_bytes; + u64 generation; +}; + +struct old_sa_defrag_extent { + struct list_head list; + struct new_sa_defrag_extent *new; + + u64 extent_offset; + u64 bytenr; + u64 offset; + u64 len; + int count; +}; + +struct new_sa_defrag_extent { + struct rb_root root; + struct list_head head; + struct btrfs_path *path; + struct inode *inode; + u64 file_pos; + u64 len; + u64 bytenr; + u64 disk_len; + u8 compress_type; +}; + +static int backref_comp(struct sa_defrag_extent_backref *b1, + struct sa_defrag_extent_backref *b2) +{ + if (b1->root_id < b2->root_id) + return -1; + else if (b1->root_id > b2->root_id) + return 1; + + if (b1->inum < b2->inum) + return -1; + else if (b1->inum > b2->inum) + return 1; + + if (b1->file_pos < b2->file_pos) + return -1; + else if (b1->file_pos > b2->file_pos) + return 1; + + /* + * [------------------------------] ===> (a range of space) + * |<--->| |<---->| =============> (fs/file tree A) + * |<---------------------------->| ===> (fs/file tree B) + * + * A range of space can refer to two file extents in one tree while + * refer to only one file extent in another tree. + * + * So we may process a disk offset more than one time(two extents in A) + * and locate at the same extent(one extent in B), then insert two same + * backrefs(both refer to the extent in B). + */ + return 0; +} + +static void backref_insert(struct rb_root *root, + struct sa_defrag_extent_backref *backref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sa_defrag_extent_backref *entry; + int ret; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct sa_defrag_extent_backref, node); + + ret = backref_comp(backref, entry); + if (ret < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&backref->node, parent, p); + rb_insert_color(&backref->node, root); +} + +/* + * Note the backref might has changed, and in this case we just return 0. 
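backref_comp() above keys each backref on the (root_id, inum, file_pos) triple, and backref_insert() sends equal keys to the right-hand child, so duplicates are kept as separate nodes; that is exactly the situation the range diagram in the comment describes, where two extents in one tree resolve to a single extent in another. A standalone illustration of that ordering, with made-up ids and offsets:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct ref_key { uint64_t root_id, inum, file_pos; };

	static int ref_key_cmp(const void *a, const void *b)
	{
		const struct ref_key *k1 = a, *k2 = b;

		if (k1->root_id != k2->root_id)
			return k1->root_id < k2->root_id ? -1 : 1;
		if (k1->inum != k2->inum)
			return k1->inum < k2->inum ? -1 : 1;
		if (k1->file_pos != k2->file_pos)
			return k1->file_pos < k2->file_pos ? -1 : 1;
		return 0;	/* equal: the rb-tree insert keeps both */
	}

	int main(void)
	{
		struct ref_key refs[] = {
			{ 5, 258, 4096 }, { 5, 257, 0 }, { 5, 258, 4096 },
		};
		int i;

		qsort(refs, 3, sizeof(refs[0]), ref_key_cmp);
		for (i = 0; i < 3; i++)
			printf("root %llu ino %llu pos %llu\n",
			       (unsigned long long)refs[i].root_id,
			       (unsigned long long)refs[i].inum,
			       (unsigned long long)refs[i].file_pos);
		return 0;
	}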
+ */ +static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, + void *ctx) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_fs_info *fs_info; + struct old_sa_defrag_extent *old = ctx; + struct new_sa_defrag_extent *new = old->new; + struct btrfs_path *path = new->path; + struct btrfs_key key; + struct btrfs_root *root; + struct sa_defrag_extent_backref *backref; + struct extent_buffer *leaf; + struct inode *inode = new->inode; + int slot; + int ret; + u64 extent_offset; + u64 num_bytes; + + if (BTRFS_I(inode)->root->root_key.objectid == root_id && + inum == btrfs_ino(inode)) + return 0; + + key.objectid = root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(inode)->root->fs_info; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + if (PTR_ERR(root) == -ENOENT) + return 0; + WARN_ON(1); + pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", + inum, offset, root_id); + return PTR_ERR(root); + } + + key.objectid = inum; + key.type = BTRFS_EXTENT_DATA_KEY; + if (offset > (u64)-1 << 32) + key.offset = 0; + else + key.offset = offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + WARN_ON(1); + return ret; + } + + while (1) { + cond_resched(); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + continue; + } + + path->slots[0]++; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid > inum) + goto out; + + if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) + continue; + + extent_offset = btrfs_file_extent_offset(leaf, extent); + if (key.offset - extent_offset != offset) + continue; + + num_bytes = btrfs_file_extent_num_bytes(leaf, extent); + if (extent_offset >= old->extent_offset + old->offset + + old->len || extent_offset + num_bytes <= + old->extent_offset + old->offset) + continue; + + break; + } + + backref = kmalloc(sizeof(*backref), GFP_NOFS); + if (!backref) { + ret = -ENOENT; + goto out; + } + + backref->root_id = root_id; + backref->inum = inum; + backref->file_pos = offset + extent_offset; + backref->num_bytes = num_bytes; + backref->extent_offset = extent_offset; + backref->generation = btrfs_file_extent_generation(leaf, extent); + backref->old = old; + backref_insert(&new->root, backref); + old->count++; +out: + btrfs_release_path(path); + WARN_ON(ret); + return ret; +} + +static noinline bool record_extent_backrefs(struct btrfs_path *path, + struct new_sa_defrag_extent *new) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; + struct old_sa_defrag_extent *old, *tmp; + int ret; + + new->path = path; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + ret = iterate_inodes_from_logical(old->bytenr, fs_info, + path, record_one_backref, + old); + BUG_ON(ret < 0 && ret != -ENOENT); + + /* no backref to be processed for this extent */ + if (!old->count) { + list_del(&old->list); + kfree(old); + } + } + + if (list_empty(&new->head)) + return false; + + return true; +} + +static int relink_is_mergable(struct extent_buffer *leaf, + struct btrfs_file_extent_item *fi, + u64 disk_bytenr) +{ + if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr) + return 0; + + if (btrfs_file_extent_type(leaf, fi) != 
BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + return 1; +} + +/* + * Note the backref might has changed, and in this case we just return 0. + */ +static noinline int relink_extent_backref(struct btrfs_path *path, + struct sa_defrag_extent_backref *prev, + struct sa_defrag_extent_backref *backref) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_file_extent_item *item; + struct btrfs_ordered_extent *ordered; + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct old_sa_defrag_extent *old = backref->old; + struct new_sa_defrag_extent *new = old->new; + struct inode *src_inode = new->inode; + struct inode *inode; + struct extent_state *cached = NULL; + int ret = 0; + u64 start; + u64 len; + u64 lock_start; + u64 lock_end; + bool merge = false; + int index; + + if (prev && prev->root_id == backref->root_id && + prev->inum == backref->inum && + prev->file_pos + prev->num_bytes == backref->file_pos) + merge = true; + + /* step 1: get root */ + key.objectid = backref->root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(src_inode)->root->fs_info; + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + if (PTR_ERR(root) == -ENOENT) + return 0; + return PTR_ERR(root); + } + if (btrfs_root_refs(&root->root_item) == 0) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + /* parse ENOENT to 0 */ + return 0; + } + + /* step 2: get inode */ + key.objectid = backref->inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR(inode)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); + + /* step 3: relink backref */ + lock_start = backref->file_pos; + lock_end = backref->file_pos + backref->num_bytes - 1; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + 0, &cached); + + ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); + if (ordered) { + btrfs_put_ordered_extent(ordered); + goto out_unlock; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + key.objectid = backref->inum; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = backref->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out_free_path; + } else if (ret > 0) { + ret = 0; + goto out_free_path; + } + + extent = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_generation(path->nodes[0], extent) != + backref->generation) + goto out_free_path; + + btrfs_release_path(path); + + start = backref->file_pos; + if (backref->extent_offset < old->extent_offset + old->offset) + start += old->extent_offset + old->offset - + backref->extent_offset; + + len = min(backref->extent_offset + backref->num_bytes, + old->extent_offset + old->offset + old->len); + len -= max(backref->extent_offset, old->extent_offset + old->offset); + + ret = btrfs_drop_extents(trans, root, inode, start, + start + len, 1); + if (ret) + goto out_free_path; +again: + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 
start; + + if (merge) { + struct btrfs_file_extent_item *fi; + u64 extent_len; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + if (ret < 0) + goto out_free_path; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + + if (relink_is_mergable(leaf, fi, new->bytenr) && + extent_len + found_key.offset == start) { + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len + len); + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + + ret = 1; + goto out_free_path; + } else { + merge = false; + btrfs_release_path(path); + goto again; + } + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); + btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); + btrfs_set_file_extent_num_bytes(leaf, item, len); + btrfs_set_file_extent_ram_bytes(leaf, item, new->len); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, new->compress_type); + btrfs_set_file_extent_encryption(leaf, item, 0); + btrfs_set_file_extent_other_encoding(leaf, item, 0); + + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, + new->disk_len, 0, + backref->root_id, backref->inum, + new->file_pos, 0); /* start - extent_offset */ + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + ret = 1; +out_free_path: + btrfs_release_path(path); + btrfs_end_transaction(trans, root); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + &cached, GFP_NOFS); + iput(inode); + return ret; +} + +static void relink_file_extents(struct new_sa_defrag_extent *new) +{ + struct btrfs_path *path; + struct old_sa_defrag_extent *old, *tmp; + struct sa_defrag_extent_backref *backref; + struct sa_defrag_extent_backref *prev = NULL; + struct inode *inode; + struct btrfs_root *root; + struct rb_node *node; + int ret; + + inode = new->inode; + root = BTRFS_I(inode)->root; + + path = btrfs_alloc_path(); + if (!path) + return; + + if (!record_extent_backrefs(path, new)) { + btrfs_free_path(path); + goto out; + } + btrfs_release_path(path); + + while (1) { + node = rb_first(&new->root); + if (!node) + break; + rb_erase(node, &new->root); + + backref = rb_entry(node, struct sa_defrag_extent_backref, node); + + ret = relink_extent_backref(path, prev, backref); + WARN_ON(ret < 0); + + kfree(prev); + + if (ret == 1) + prev = backref; + else + prev = NULL; + cond_resched(); + } + kfree(prev); + + btrfs_free_path(path); + + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } +out: + atomic_dec(&root->fs_info->defrag_running); + wake_up(&root->fs_info->transaction_wait); + + kfree(new); +} + +static struct new_sa_defrag_extent * +record_old_file_extents(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; 
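	/*
	 * Roughly, the snapshot-aware defrag pieces above and below fit
	 * together like this (summary added for the reader, not text from
	 * the commit):
	 *
	 * 1. In btrfs_finish_ordered_io() further down, if the finished
	 *    range was marked EXTENT_DEFRAG and the inode's generation is
	 *    not newer than the root's last snapshot (so the old extents may
	 *    be shared), record_old_file_extents() collects the old file
	 *    extent items covering the defragged range.
	 * 2. Once the ordered extent is done, relink_file_extents() calls
	 *    record_extent_backrefs(), which walks every reference to those
	 *    old extents via iterate_inodes_from_logical() and queues each
	 *    one as a sa_defrag_extent_backref in the rb-tree keyed above.
	 * 3. relink_extent_backref() then drops each stale reference and
	 *    inserts (or merges) a file extent pointing at the new extent,
	 *    taking an extra reference on it, so snapshots share the freshly
	 *    defragged data instead of pinning the old fragmented copies.
	 */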
+ struct btrfs_key key; + struct old_sa_defrag_extent *old, *tmp; + struct new_sa_defrag_extent *new; + int ret; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return NULL; + + new->inode = inode; + new->file_pos = ordered->file_offset; + new->len = ordered->len; + new->bytenr = ordered->start; + new->disk_len = ordered->disk_len; + new->compress_type = ordered->compress_type; + new->root = RB_ROOT; + INIT_LIST_HEAD(&new->head); + + path = btrfs_alloc_path(); + if (!path) + goto out_kfree; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = new->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out_free_path; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + /* find out all the old extents for the file range */ + while (1) { + struct btrfs_file_extent_item *extent; + struct extent_buffer *l; + int slot; + u64 num_bytes; + u64 offset; + u64 end; + u64 disk_bytenr; + u64 extent_offset; + + l = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out_free_list; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid != btrfs_ino(inode)) + break; + if (key.type != BTRFS_EXTENT_DATA_KEY) + break; + if (key.offset >= new->file_pos + new->len) + break; + + extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); + + num_bytes = btrfs_file_extent_num_bytes(l, extent); + if (key.offset + num_bytes < new->file_pos) + goto next; + + disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); + if (!disk_bytenr) + goto next; + + extent_offset = btrfs_file_extent_offset(l, extent); + + old = kmalloc(sizeof(*old), GFP_NOFS); + if (!old) + goto out_free_list; + + offset = max(new->file_pos, key.offset); + end = min(new->file_pos + new->len, key.offset + num_bytes); + + old->bytenr = disk_bytenr; + old->extent_offset = extent_offset; + old->offset = offset - key.offset; + old->len = end - offset; + old->new = new; + old->count = 0; + list_add_tail(&old->list, &new->head); +next: + path->slots[0]++; + cond_resched(); + } + + btrfs_free_path(path); + atomic_inc(&root->fs_info->defrag_running); + + return new; + +out_free_list: + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } +out_free_path: + btrfs_free_path(path); +out_kfree: + kfree(new); + return NULL; +} + /* * helper function for btrfs_finish_ordered_io, this * just reads in some of the csum leaves to prime them into ram @@ -1909,6 +2582,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; + struct new_sa_defrag_extent *new = NULL; int compress_type = 0; int ret; bool nolock; @@ -1943,6 +2617,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset + ordered_extent->len - 1, 0, &cached_state); + ret = test_range_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 1, cached_state); + if (ret) { + u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); + if (last_snapshot >= BTRFS_I(inode)->generation) + /* the inode is shared */ + new = record_old_file_extents(inode, ordered_extent); + + clear_extent_bit(io_tree, ordered_extent->file_offset, + 
ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); + } + if (nolock) trans = btrfs_join_transaction_nolock(root); else @@ -2001,17 +2689,33 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (trans) btrfs_end_transaction(trans, root); - if (ret) + if (ret) { clear_extent_uptodate(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, NULL, GFP_NOFS); + /* + * If the ordered extent had an IOERR or something else went + * wrong we need to return the space for this ordered extent + * back to the allocator. + */ + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) + btrfs_free_reserved_extent(root, ordered_extent->start, + ordered_extent->disk_len); + } + + /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ btrfs_remove_ordered_extent(inode, ordered_extent); + /* for snapshot-aware defrag */ + if (new) + relink_file_extents(new); + /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@ -2062,7 +2766,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror) { - size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; char *kaddr; @@ -2167,11 +2871,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) } } -enum btrfs_orphan_cleanup_state { - ORPHAN_CLEANUP_STARTED = 1, - ORPHAN_CLEANUP_DONE = 2, -}; - /* * This is called in transaction commit time. 
If there are no orphan * files in the subvolume, it removes orphan item and frees block_rsv @@ -2469,6 +3168,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) */ set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags); + atomic_inc(&root->orphan_inodes); /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { @@ -2491,6 +3191,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) goto out; ret = btrfs_truncate(inode); + if (ret) + btrfs_orphan_del(NULL, inode); } else { nr_unlink++; } @@ -2709,34 +3411,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); - btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); - btrfs_set_inode_mode(leaf, item, inode->i_mode); - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + struct btrfs_map_token token; - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_nsec); + btrfs_init_map_token(&token); - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_nsec); + btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); + btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); + btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, + &token); + btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); + btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_nsec); + btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec, &token); - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); - btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); - btrfs_set_inode_sequence(leaf, item, inode->i_version); - btrfs_set_inode_transid(leaf, item, trans->transid); - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, 0); + btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec, &token); + + btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec, &token); + + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), + &token); + btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, + &token); + btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); + btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); + btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); + btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); + btrfs_set_token_inode_block_group(leaf, item, 0, &token); } /* @@ -3304,7 +4013,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 mask = root->sectorsize - 1; u32 
found_type = (u8)-1; int found_extent; int del_item; @@ -3328,7 +4036,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, * extent just the way it is. */ if (root->ref_cows || root == root->fs_info->tree_root) - btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); + btrfs_drop_extent_cache(inode, ALIGN(new_size, + root->sectorsize), (u64)-1, 0); /* * This function is also used to drop the items in the log tree before @@ -3407,10 +4116,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); - extent_num_bytes = new_size - - found_key.offset + root->sectorsize - 1; - extent_num_bytes = extent_num_bytes & - ~((u64)root->sectorsize - 1); + extent_num_bytes = ALIGN(new_size - + found_key.offset, + root->sectorsize); btrfs_set_file_extent_num_bytes(leaf, fi, extent_num_bytes); num_dec = (orig_num_bytes - @@ -3646,9 +4354,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 mask = root->sectorsize - 1; - u64 hole_start = (oldsize + mask) & ~mask; - u64 block_end = (size + mask) & ~mask; + u64 hole_start = ALIGN(oldsize, root->sectorsize); + u64 block_end = ALIGN(size, root->sectorsize); u64 last_byte; u64 cur_offset; u64 hole_size; @@ -3681,7 +4388,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) break; } last_byte = min(extent_map_end(em), block_end); - last_byte = (last_byte + mask) & ~mask; + last_byte = ALIGN(last_byte , root->sectorsize); if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; hole_size = last_byte - cur_offset; @@ -3832,6 +4539,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); + + /* Disable nonlocked read DIO to avoid the end less truncate */ + btrfs_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + btrfs_inode_resume_unlocked_dio(inode); + ret = btrfs_truncate(inode); if (ret && inode->i_nlink) btrfs_orphan_del(NULL, inode); @@ -3904,6 +4617,12 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + ret = btrfs_commit_inode_delayed_inode(inode); + if (ret) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { btrfs_orphan_del(NULL, inode); @@ -3941,7 +4660,7 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } - trans = btrfs_start_transaction_lflush(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); @@ -3955,9 +4674,6 @@ void btrfs_evict_inode(struct inode *inode) break; trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root); @@ -4854,7 +5570,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } insert_inode_hash(inode); @@ -5006,12 +5723,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto 
out_unlock; } - err = btrfs_update_inode(trans, root, inode); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see @@ -5396,8 +6107,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_inline_len(leaf, item); - extent_end = (extent_start + size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + extent_end = ALIGN(extent_start + size, root->sectorsize); } if (start >= extent_end) { @@ -5469,8 +6179,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, size - extent_offset); em->start = extent_start + extent_offset; - em->len = (copy_size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); + em->len = ALIGN(copy_size, root->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; if (compress_type) { @@ -5949,6 +6658,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->start = start; em->orig_start = orig_start; + em->mod_start = start; + em->mod_len = len; em->len = len; em->block_len = block_len; em->block_start = block_start; @@ -5990,16 +6701,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 len = bh_result->b_size; struct btrfs_trans_handle *trans; int unlock_bits = EXTENT_LOCKED; - int ret; + int ret = 0; - if (create) { - ret = btrfs_delalloc_reserve_space(inode, len); - if (ret) - return ret; + if (create) unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; - } else { + else len = min_t(u64, len, root->sectorsize); - } lockstart = start; lockend = start + len - 1; @@ -6011,14 +6718,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) return -ENOTBLK; - if (create) { - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_DELALLOC, NULL, - &cached_state, GFP_NOFS); - if (ret) - goto unlock_err; - } - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { ret = PTR_ERR(em); @@ -6050,7 +6749,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (!create && (em->block_start == EXTENT_MAP_HOLE || test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { free_extent_map(em); - ret = 0; goto unlock_err; } @@ -6148,6 +6846,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, */ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); + + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockstart + len - 1, EXTENT_DELALLOC, NULL, + &cached_state, GFP_NOFS); + BUG_ON(ret); } /* @@ -6156,24 +6863,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * aren't using if there is any left over space. */ if (lockstart < lockend) { - if (create && len < lockend - lockstart) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockstart + len - 1, - unlock_bits | EXTENT_DEFRAG, 1, 0, - &cached_state, GFP_NOFS); - /* - * Beside unlock, we also need to cleanup reserved space - * for the left range by attaching EXTENT_DO_ACCOUNTING. 
- */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, - lockstart + len, lockend, - unlock_bits | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); - } else { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, unlock_bits, 1, 0, - &cached_state, GFP_NOFS); - } + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, unlock_bits, 1, 0, + &cached_state, GFP_NOFS); } else { free_extent_state(cached_state); } @@ -6183,9 +6875,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, return 0; unlock_err: - if (create) - unlock_bits |= EXTENT_DO_ACCOUNTING; - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); return ret; @@ -6426,19 +7115,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int async_submit = 0; map_length = orig_bio->bi_size; - ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { bio_put(orig_bio); return -EIO; } - if (map_length >= orig_bio->bi_size) { bio = orig_bio; goto submit; } - async_submit = 1; + /* async crcs make it difficult to collect full stripe writes. */ + if (btrfs_get_alloc_profile(root, 1) & + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) + async_submit = 0; + else + async_submit = 1; + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); if (!bio) return -ENOMEM; @@ -6480,7 +7174,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_end_io = btrfs_end_dio_bio; map_length = orig_bio->bi_size; - ret = btrfs_map_block(root->fs_info, READ, + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -6623,15 +7317,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = 0; + int flags = 0; + bool wakeup = true; + bool relock = false; + ssize_t ret; if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, offset, nr_segs)) return 0; - return __blockdev_direct_IO(rw, iocb, inode, - BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, 0); + atomic_inc(&inode->i_dio_count); + smp_mb__after_atomic_inc(); + + if (rw & WRITE) { + count = iov_length(iov, nr_segs); + /* + * If the write DIO is beyond the EOF, we need update + * the isize, but it is protected by i_mutex. So we can + * not unlock the i_mutex at this case. 
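The write side of this rework reserves delalloc space for the whole iov up front (btrfs_delalloc_reserve_space(inode, count)) and settles the account after __blockdev_direct_IO() returns, in the hunk just below; restated from that hunk with explanatory comments added, the three outcomes are:

	if (ret < 0 && ret != -EIOCBQUEUED)
		/* nothing was submitted: give the whole data reservation back */
		btrfs_delalloc_release_space(inode, count);
	else if (ret >= 0 && (size_t)ret < count)
		/* short write: give back only the unwritten tail */
		btrfs_delalloc_release_space(inode, count - (size_t)ret);
	else
		/* fully written, or queued AIO (-EIOCBQUEUED): the data
		 * reservation is consumed, so only the spare metadata
		 * reservation is dropped */
		btrfs_delalloc_release_metadata(inode, 0);

On the read side, DIO_LOCKING | DIO_SKIP_HOLES is requested only while BTRFS_INODE_READDIO_NEED_LOCK is set, i.e. while a truncate has temporarily disabled the unlocked-read optimization.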
+ */ + if (offset + count <= inode->i_size) { + mutex_unlock(&inode->i_mutex); + relock = true; + } + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags))) { + inode_dio_done(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; + } + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); + if (rw & WRITE) { + if (ret < 0 && ret != -EIOCBQUEUED) + btrfs_delalloc_release_space(inode, count); + else if (ret >= 0 && (size_t)ret < count) + btrfs_delalloc_release_space(inode, + count - (size_t)ret); + else + btrfs_delalloc_release_metadata(inode, 0); + } +out: + if (wakeup) + inode_dio_done(inode); + if (relock) + mutex_lock(&inode->i_mutex); + + return ret; } #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) @@ -6735,8 +7474,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) return; } lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, - page_offset(page)); + ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); if (ordered) { /* * IO on this page will never be started, so we need @@ -7216,8 +7954,9 @@ int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0 && - !btrfs_is_free_space_inode(inode)) + root != root->fs_info->tree_root) return 1; else return generic_drop_inode(inode); @@ -7299,40 +8038,22 @@ int btrfs_init_cachep(void) static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { + u64 delalloc_bytes; struct inode *inode = dentry->d_inode; u32 blocksize = inode->i_sb->s_blocksize; generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; + + spin_lock(&BTRFS_I(inode)->lock); + delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; + spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + - ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; + ALIGN(delalloc_bytes, blocksize)) >> 9; return 0; } -/* - * If a file is moved, it will inherit the cow and compression flags of the new - * directory. 
- */ -static void fixup_inode_flags(struct inode *dir, struct inode *inode) -{ - struct btrfs_inode *b_dir = BTRFS_I(dir); - struct btrfs_inode *b_inode = BTRFS_I(inode); - - if (b_dir->flags & BTRFS_INODE_NODATACOW) - b_inode->flags |= BTRFS_INODE_NODATACOW; - else - b_inode->flags &= ~BTRFS_INODE_NODATACOW; - - if (b_dir->flags & BTRFS_INODE_COMPRESS) { - b_inode->flags |= BTRFS_INODE_COMPRESS; - b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; - } else { - b_inode->flags &= ~(BTRFS_INODE_COMPRESS | - BTRFS_INODE_NOCOMPRESS); - } -} - static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -7498,8 +8219,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - fixup_inode_flags(new_dir, old_inode); - ret = btrfs_add_link(trans, new_dir, old_inode, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); @@ -7583,7 +8302,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); -again: + spin_lock(&root->fs_info->delalloc_lock); list_splice_init(&root->fs_info->delalloc_inodes, &splice); while (!list_empty(&splice)) { @@ -7593,8 +8312,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) list_del_init(&binode->delalloc_inodes); inode = igrab(&binode->vfs_inode); - if (!inode) + if (!inode) { + clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, + &binode->runtime_flags); continue; + } list_add_tail(&binode->delalloc_inodes, &root->fs_info->delalloc_inodes); @@ -7619,13 +8341,6 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) btrfs_wait_and_free_delalloc_work(work); } - spin_lock(&root->fs_info->delalloc_lock); - if (!list_empty(&root->fs_info->delalloc_inodes)) { - spin_unlock(&root->fs_info->delalloc_lock); - goto again; - } - spin_unlock(&root->fs_info->delalloc_lock); - /* the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return @@ -7801,8 +8516,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, - 0, *alloc_hint, &ins, 1); + ret = btrfs_reserve_extent(trans, root, + min(num_bytes, 256ULL * 1024 * 1024), + min_size, 0, *alloc_hint, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c3f09f71bedd..c83086fdda05 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -42,12 +42,12 @@ #include #include #include +#include #include "compat.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "volumes.h" #include "locking.h" @@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return 0; } -static noinline int create_subvol(struct btrfs_root *root, +static noinline int create_subvol(struct inode *dir, struct dentry *dentry, char *name, int namelen, u64 *async_transid, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *new_root; - struct dentry *parent = dentry->d_parent; - struct inode *dir; + struct btrfs_block_rsv block_rsv; struct 
timespec cur_time = CURRENT_TIME; int ret; int err; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; + u64 qgroup_reserved; uuid_le new_uuid; ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) return ret; - dir = parent->d_inode; - + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* - * 1 - inode item - * 2 - refs - * 1 - root item - * 2 - dir items + * The same as the snapshot creation, please see the comment + * of create_snapshot(). */ - trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) - return PTR_ERR(trans); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + 7, &qgroup_reserved); + if (ret) + return ret; - ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, - inherit ? *inherit : NULL); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; + + ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit); if (ret) goto fail; @@ -516,6 +522,8 @@ static noinline int create_subvol(struct btrfs_root *root, BUG_ON(ret); fail: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); @@ -527,13 +535,15 @@ static noinline int create_subvol(struct btrfs_root *root, if (!ret) d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); - +out: + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); return ret; } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen, u64 *async_transid, - bool readonly, struct btrfs_qgroup_inherit **inherit) +static int create_snapshot(struct btrfs_root *root, struct inode *dir, + struct dentry *dentry, char *name, int namelen, + u64 *async_transid, bool readonly, + struct btrfs_qgroup_inherit *inherit) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -549,23 +559,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * 1 - parent dir inode + * 2 - dir entries + * 1 - root item + * 2 - root ref/backref + * 1 - root of snapshot + */ + ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, 7, + &pending_snapshot->qgroup_reserved); + if (ret) + goto out; + pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; - if (inherit) { - pending_snapshot->inherit = *inherit; - *inherit = NULL; /* take responsibility to free it */ - } + pending_snapshot->dir = dir; + pending_snapshot->inherit = inherit; - trans = btrfs_start_transaction(root->fs_info->extent_root, 6); + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } - ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); - BUG_ON(ret); - spin_lock(&root->fs_info->trans_lock); list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); @@ -602,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, d_instantiate(dentry, inode); ret = 0; fail: + btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, + &pending_snapshot->block_rsv, + pending_snapshot->qgroup_reserved); +out: kfree(pending_snapshot); return ret; } @@ -695,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct 
btrfs_root *snap_src, u64 *async_transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -732,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry, name, namelen, + error = create_snapshot(snap_src, dir, dentry, name, namelen, async_transid, readonly, inherit); } else { - error = create_subvol(BTRFS_I(dir)->root, dentry, - name, namelen, async_transid, inherit); + error = create_subvol(dir, dentry, name, namelen, + async_transid, inherit); } if (!error) fsnotify_mkdir(dir, dentry); @@ -818,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root, while(1) { ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, newer_than); + path, newer_than); if (ret != 0) goto none; if (min_key.objectid != ino) @@ -1206,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!(inode->i_sb->s_flags & MS_ACTIVE)) break; + if (btrfs_defrag_cancelled(root->fs_info)) { + printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); + ret = -EAGAIN; + break; + } + if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, extent_thresh, &last_len, &skip, &defrag_end, range->flags & @@ -1329,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, int ret = 0; int mod = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1363,6 +1388,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); + if (!devid) { + ret = -EINVAL; + goto out_free; + } printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } @@ -1371,7 +1400,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); - ret = -EINVAL; + ret = -ENODEV; goto out_free; } @@ -1379,7 +1408,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, printk(KERN_INFO "btrfs: resizer unable to apply on " "readonly device %llu\n", (unsigned long long)devid); - ret = -EINVAL; + ret = -EPERM; goto out_free; } @@ -1401,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } if (device->is_tgtdev_for_dev_replace) { - ret = -EINVAL; + ret = -EPERM; goto out_free; } @@ -1457,7 +1486,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, u64 *transid, bool readonly, - struct btrfs_qgroup_inherit **inherit) + struct btrfs_qgroup_inherit *inherit) { int namelen; int ret = 0; @@ -1566,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, vol_args->fd, subvol, ptr, - readonly, &inherit); + readonly, inherit); if (ret == 0 && ptr && copy_to_user(arg + @@ -1863,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode, path->keep_locks = 1; while(1) { - ret = btrfs_search_forward(root, &key, &max_key, path, 0, + ret = btrfs_search_forward(root, &key, &max_key, path, sk->min_transid); if (ret != 0) { if (ret > 0) @@ -2035,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; + struct 
btrfs_block_rsv block_rsv; + u64 qgroup_reserved; int namelen; int ret; int err = 0; @@ -2124,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out_up_write; + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); + /* + * One for dir inode, two for dir entries, two for root + * ref/backref. + */ + err = btrfs_subvolume_reserve_metadata(root, &block_rsv, + 5, &qgroup_reserved); + if (err) + goto out_up_write; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto out_up_write; + goto out_release; } - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &block_rsv; + trans->bytes_reserved = block_rsv.size; ret = btrfs_unlink_subvol(trans, root, dir, dest->root_key.objectid, @@ -2159,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } } out_end_trans: + trans->block_rsv = NULL; + trans->bytes_reserved = 0; ret = btrfs_end_transaction(trans, root); if (ret && !err) err = ret; inode->i_flags |= S_DEAD; +out_release: + btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); out_up_write: up_write(&root->fs_info->subvol_sem); out_unlock: @@ -2171,6 +2217,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, shrink_dcache_sb(root->fs_info->sb); btrfs_invalidate_inodes(dest); d_delete(dentry); + + /* the last ref */ + if (dest->cache_inode) { + iput(dest->cache_inode); + dest->cache_inode = NULL; + } } out_dput: dput(dentry); @@ -2211,10 +2263,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EPERM; goto out; } - ret = btrfs_defrag_root(root, 0); + ret = btrfs_defrag_root(root); if (ret) goto out; - ret = btrfs_defrag_root(root->fs_info->extent_root, 0); + ret = btrfs_defrag_root(root->fs_info->extent_root); break; case S_IFREG: if (!(file->f_mode & FMODE_WRITE)) { @@ -3111,7 +3163,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, u64 transid; int ret; - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) return PTR_ERR(trans); @@ -3289,7 +3341,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) struct inode_fs_paths *ipath = NULL; struct btrfs_path *path; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_DAC_READ_SEARCH)) return -EPERM; path = btrfs_alloc_path(); @@ -3914,6 +3966,65 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, return ret; } +static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + const char *label = root->fs_info->super_copy->label; + size_t len = strnlen(label, BTRFS_LABEL_SIZE); + int ret; + + if (len == BTRFS_LABEL_SIZE) { + pr_warn("btrfs: label is too long, return the first %zu bytes\n", + --len); + } + + mutex_lock(&root->fs_info->volume_mutex); + ret = copy_to_user(arg, label, len); + mutex_unlock(&root->fs_info->volume_mutex); + + return ret ? 
-EFAULT : 0; +} + +static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + char label[BTRFS_LABEL_SIZE]; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(label, arg, sizeof(label))) + return -EFAULT; + + if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { + pr_err("btrfs: unable to set label with more than %d bytes\n", + BTRFS_LABEL_SIZE - 1); + return -EINVAL; + } + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + mutex_lock(&root->fs_info->volume_mutex); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + strcpy(super_block->label, label); + ret = btrfs_end_transaction(trans, root); + +out_unlock: + mutex_unlock(&root->fs_info->volume_mutex); + mnt_drop_write_file(file); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -4014,6 +4125,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_qgroup_limit(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(root, argp); + case BTRFS_IOC_GET_FSLABEL: + return btrfs_ioctl_get_fslabel(file, argp); + case BTRFS_IOC_SET_FSLABEL: + return btrfs_ioctl_set_fslabel(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 2a1762c66041..e95df435d897 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -113,11 +113,10 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) read_unlock(&eb->lock); return; } - read_unlock(&eb->lock); - wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); - read_lock(&eb->lock); if (atomic_read(&eb->blocking_writers)) { read_unlock(&eb->lock); + wait_event(eb->write_lock_wq, + atomic_read(&eb->blocking_writers) == 0); goto again; } atomic_inc(&eb->read_locks); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e5ed56729607..dc08d77b717e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && + !(type == BTRFS_ORDERED_NOCOW)) + entry->csum_bytes_left = disk_len; entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = igrab(inode); @@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); + INIT_LIST_HEAD(&entry->log_list); trace_btrfs_ordered_extent_add(inode, entry); @@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode, tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); + WARN_ON(entry->csum_bytes_left < sum->len); + entry->csum_bytes_left -= sum->len; + if (entry->csum_bytes_left == 0) + wake_up(&entry->wait); spin_unlock_irq(&tree->lock); } @@ -405,6 +413,66 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, return ret == 0; } +/* Needs to either be called under a log transaction or the log_mutex */ +void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) +{ + struct btrfs_ordered_inode_tree *tree; + struct btrfs_ordered_extent *ordered; + struct rb_node *n; + 
int index = log->log_transid % 2; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + spin_lock(&log->log_extents_lock[index]); + if (list_empty(&ordered->log_list)) { + list_add_tail(&ordered->log_list, &log->logged_list[index]); + atomic_inc(&ordered->refs); + } + spin_unlock(&log->log_extents_lock[index]); + } + spin_unlock_irq(&tree->lock); +} + +void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +{ + struct btrfs_ordered_extent *ordered; + int index = transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + while (!list_empty(&log->logged_list[index])) { + ordered = list_first_entry(&log->logged_list[index], + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + spin_unlock_irq(&log->log_extents_lock[index]); + wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, + &ordered->flags)); + btrfs_put_ordered_extent(ordered); + spin_lock_irq(&log->log_extents_lock[index]); + } + spin_unlock_irq(&log->log_extents_lock[index]); +} + +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid) +{ + struct btrfs_ordered_extent *ordered; + int index = transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + while (!list_empty(&log->logged_list[index])) { + ordered = list_first_entry(&log->logged_list[index], + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + spin_unlock_irq(&log->log_extents_lock[index]); + btrfs_put_ordered_extent(ordered); + spin_lock_irq(&log->log_extents_lock[index]); + } + spin_unlock_irq(&log->log_extents_lock[index]); +} + /* * used to drop a reference on an ordered extent. This will free * the extent if the last reference is dropped @@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) * extra check to make sure the ordered operation list really is empty * before we return */ -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int wait) { struct btrfs_inode *btrfs_inode; struct inode *inode; + struct btrfs_transaction *cur_trans = trans->transaction; struct list_head splice; struct list_head works; struct btrfs_delalloc_work *work, *next; @@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); -again: - list_splice_init(&root->fs_info->ordered_operations, &splice); - + list_splice_init(&cur_trans->ordered_operations, &splice); while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); - inode = &btrfs_inode->vfs_inode; list_del_init(&btrfs_inode->ordered_operations); @@ -574,24 +640,22 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) * the inode may be getting freed (in sys_unlink path). 
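Both the logged-extent helpers above and the loop being commented here share a structure that recurs throughout this patch: unlink an entry (or take a reference on it) while the list lock is held, then drop the lock before doing anything that can sleep, such as waiting for IO or dropping the final reference. A compact user-space sketch of that shape, with a pthread mutex standing in for the spinlock and malloc'd nodes standing in for ordered extents (all names here are illustrative, not part of the patch):

#include <pthread.h>
#include <stdlib.h>

struct logged_item {                      /* illustrative stand-in for an ordered extent */
        struct logged_item *next;
        int refs;
};

static pthread_mutex_t log_lock = PTHREAD_MUTEX_INITIALIZER;
static struct logged_item *logged_list;   /* plays the role of log->logged_list[index] */

static void put_item(struct logged_item *it)
{
        if (--it->refs == 0)              /* last reference, like btrfs_put_ordered_extent() */
                free(it);
}

/* same shape as btrfs_free_logged_extents(): unlink under the lock,
 * release the lock before the (possibly sleeping) wait or the final put */
static void drain_logged_list(void)
{
        pthread_mutex_lock(&log_lock);
        while (logged_list) {
                struct logged_item *it = logged_list;

                logged_list = it->next;              /* list_del_init() */
                pthread_mutex_unlock(&log_lock);
                /* the "wait" variant would block here until the IO bit is set */
                put_item(it);
                pthread_mutex_lock(&log_lock);
        }
        pthread_mutex_unlock(&log_lock);
}

int main(void)
{
        for (int i = 0; i < 3; i++) {                /* queue a few fake extents */
                struct logged_item *it = calloc(1, sizeof(*it));

                if (!it)
                        break;
                it->refs = 1;
                it->next = logged_list;
                logged_list = it;
        }
        drain_logged_list();
        return 0;
}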
*/ inode = igrab(inode); - - if (!wait && inode) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); - } - if (!inode) continue; + + if (!wait) + list_add_tail(&BTRFS_I(inode)->ordered_operations, + &cur_trans->ordered_operations); spin_unlock(&root->fs_info->ordered_extent_lock); work = btrfs_alloc_delalloc_work(inode, wait, 1); if (!work) { + spin_lock(&root->fs_info->ordered_extent_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) list_add_tail(&btrfs_inode->ordered_operations, &splice); - spin_lock(&root->fs_info->ordered_extent_lock); list_splice_tail(&splice, - &root->fs_info->ordered_operations); + &cur_trans->ordered_operations); spin_unlock(&root->fs_info->ordered_extent_lock); ret = -ENOMEM; goto out; @@ -603,9 +667,6 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) cond_resched(); spin_lock(&root->fs_info->ordered_extent_lock); } - if (wait && !list_empty(&root->fs_info->ordered_operations)) - goto again; - spin_unlock(&root->fs_info->ordered_extent_lock); out: list_for_each_entry_safe(work, next, &works, list) { @@ -974,6 +1035,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { + struct btrfs_transaction *cur_trans = trans->transaction; u64 last_mod; last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); @@ -988,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->ordered_extent_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) { list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); + &cur_trans->ordered_operations); } spin_unlock(&root->fs_info->ordered_extent_lock); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f29d4bf5fbe7..8eadfe406cdd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -79,6 +79,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent * has done its due diligence in updating * the isize. */ +#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered + ordered extent */ struct btrfs_ordered_extent { /* logical offset in the file */ @@ -96,6 +98,9 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; + /* number of bytes that still need csumming */ + u64 csum_bytes_left; + /* * the end of the ordered extent which is behind it but * didn't update disk_i_size. 
Please see the comment of @@ -118,6 +123,9 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; + /* If we need to wait on this to be done */ + struct list_head log_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); +void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); +void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); +void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void ordered_data_exit(void); #endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 50d95fd190a5..920957ecb27e 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_dev_extent_chunk_offset(l, dev_extent), (unsigned long long) btrfs_dev_extent_length(l, dev_extent)); + break; case BTRFS_DEV_STATS_KEY: printk(KERN_INFO "\t\tdevice stats\n"); break; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a5c856234323..aee4b1cc3d98 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -23,13 +23,13 @@ #include #include #include +#include #include "ctree.h" #include "transaction.h" #include "disk-io.h" #include "locking.h" #include "ulist.h" -#include "ioctl.h" #include "backref.h" /* TODO XXX FIXME @@ -620,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, key.offset = qgroupid; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -661,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, key.offset = qgroup->qgroupid; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -702,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans, key.offset = 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret > 0) ret = -ENOENT; @@ -732,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, { struct btrfs_path *path; struct btrfs_key key; + struct extent_buffer *leaf = NULL; int ret; - - if (!root) - return -EINVAL; + int nr = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->leave_spinning = 1; + + key.objectid = 0; + key.offset = 0; + key.type = 0; + while (1) { - key.objectid = 0; - key.offset = 0; - key.type = 0; - - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) { - if (path->slots[0] == 0) - break; - path->slots[0]--; - } else if (ret < 0) { + if (ret < 0) + goto out; + leaf = 
path->nodes[0]; + nr = btrfs_header_nritems(leaf); + if (!nr) break; - } - - ret = btrfs_del_item(trans, root, path); + /* + * delete the leaf one by one + * since the whole tree is going + * to be deleted. + */ + path->slots[0] = 0; + ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) goto out; + btrfs_release_path(path); } ret = 0; @@ -847,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, int ret = 0; spin_lock(&fs_info->qgroup_lock); + if (!fs_info->quota_root) { + spin_unlock(&fs_info->qgroup_lock); + return 0; + } fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; quota_root = fs_info->quota_root; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c new file mode 100644 index 000000000000..07222053c7d8 --- /dev/null +++ b/fs/btrfs/raid56.c @@ -0,0 +1,2099 @@ +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "rcu-string.h" + +/* set when additional merges to this rbio are not allowed */ +#define RBIO_RMW_LOCKED_BIT 1 + +/* + * set when this rbio is sitting in the hash, but it is just a cache + * of past RMW + */ +#define RBIO_CACHE_BIT 2 + +/* + * set when it is safe to trust the stripe_pages for caching + */ +#define RBIO_CACHE_READY_BIT 3 + + +#define RBIO_CACHE_SIZE 1024 + +struct btrfs_raid_bio { + struct btrfs_fs_info *fs_info; + struct btrfs_bio *bbio; + + /* + * logical block numbers for the start of each stripe + * The last one or two are p/q. These are sorted, + * so raid_map[0] is the start of our full stripe + */ + u64 *raid_map; + + /* while we're doing rmw on a stripe + * we put it into a hash table so we can + * lock the stripe and merge more rbios + * into it. + */ + struct list_head hash_list; + + /* + * LRU list for the stripe cache + */ + struct list_head stripe_cache; + + /* + * for scheduling work in the helper threads + */ + struct btrfs_work work; + + /* + * bio list and bio_list_lock are used + * to add more bios into the stripe + * in hopes of avoiding the full rmw + */ + struct bio_list bio_list; + spinlock_t bio_list_lock; + + /* also protected by the bio_list_lock, the + * plug list is used by the plugging code + * to collect partial bios while plugged. 
The + * stripe locking code also uses it to hand off + * the stripe lock to the next pending IO + */ + struct list_head plug_list; + + /* + * flags that tell us if it is safe to + * merge with this bio + */ + unsigned long flags; + + /* size of each individual stripe on disk */ + int stripe_len; + + /* number of data stripes (no p/q) */ + int nr_data; + + /* + * set if we're doing a parity rebuild + * for a read from higher up, which is handled + * differently from a parity rebuild as part of + * rmw + */ + int read_rebuild; + + /* first bad stripe */ + int faila; + + /* second bad stripe (for raid6 use) */ + int failb; + + /* + * number of pages needed to represent the full + * stripe + */ + int nr_pages; + + /* + * size of all the bios in the bio_list. This + * helps us decide if the rbio maps to a full + * stripe or not + */ + int bio_list_bytes; + + atomic_t refs; + + /* + * these are two arrays of pointers. We allocate the + * rbio big enough to hold them both and setup their + * locations when the rbio is allocated + */ + + /* pointers to pages that we allocated for + * reading/writing stripes directly from the disk (including P/Q) + */ + struct page **stripe_pages; + + /* + * pointers to the pages in the bio_list. Stored + * here for faster lookup + */ + struct page **bio_pages; +}; + +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); +static noinline void finish_rmw(struct btrfs_raid_bio *rbio); +static void rmw_work(struct btrfs_work *work); +static void read_rebuild_work(struct btrfs_work *work); +static void async_rmw_stripe(struct btrfs_raid_bio *rbio); +static void async_read_rebuild(struct btrfs_raid_bio *rbio); +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); +static void __free_raid_bio(struct btrfs_raid_bio *rbio); +static void index_rbio_pages(struct btrfs_raid_bio *rbio); +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); + +/* + * the stripe hash table is used for locking, and to collect + * bios in hopes of making a full stripe + */ +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash_table *x; + struct btrfs_stripe_hash *cur; + struct btrfs_stripe_hash *h; + int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; + int i; + int table_size; + + if (info->stripe_hash_table) + return 0; + + /* + * The table is large, starting with order 4 and can go as high as + * order 7 in case lock debugging is turned on. + * + * Try harder to allocate and fallback to vmalloc to lower the chance + * of a failing mount. + */ + table_size = sizeof(*table) + sizeof(*h) * num_entries; + table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!table) { + table = vzalloc(table_size); + if (!table) + return -ENOMEM; + } + + spin_lock_init(&table->cache_lock); + INIT_LIST_HEAD(&table->stripe_cache); + + h = table->table; + + for (i = 0; i < num_entries; i++) { + cur = h + i; + INIT_LIST_HEAD(&cur->hash_list); + spin_lock_init(&cur->lock); + init_waitqueue_head(&cur->wait); + } + + x = cmpxchg(&info->stripe_hash_table, NULL, table); + if (x) { + if (is_vmalloc_addr(x)) + vfree(x); + else + kfree(x); + } + return 0; +} + +/* + * caching an rbio means to copy anything from the + * bio_pages array into the stripe_pages array. 
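For a sense of scale on the hash table allocation above: with the hypothetical sizes below (stand-ins, not the real struct btrfs_stripe_hash_table and struct btrfs_stripe_hash layouts), a 1 << 11 entry table already needs a few dozen physically contiguous pages, which is why the code falls back from kzalloc() to vzalloc() instead of failing the mount:

#include <stdio.h>

/* Hypothetical stand-ins; the real numbers come from the kernel structs. */
#define TABLE_HDR_BYTES   64
#define HASH_ENTRY_BYTES  64
#define TABLE_BITS        11            /* assumed BTRFS_STRIPE_HASH_TABLE_BITS */
#define PAGE_BYTES        4096

int main(void)
{
        unsigned long entries = 1UL << TABLE_BITS;
        unsigned long table_size = TABLE_HDR_BYTES + entries * HASH_ENTRY_BYTES;
        unsigned long pages = (table_size + PAGE_BYTES - 1) / PAGE_BYTES;

        /* a multi-page contiguous allocation like this can fail under
         * fragmentation, hence the vzalloc() fallback in the patch */
        printf("%lu entries -> %lu bytes (~%lu pages)\n",
               entries, table_size, pages);
        return 0;
}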
We + * use the page uptodate bit in the stripe cache array + * to indicate if it has valid data + * + * once the caching is done, we set the cache ready + * bit. + */ +static void cache_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + char *s; + char *d; + int ret; + + ret = alloc_rbio_pages(rbio); + if (ret) + return; + + for (i = 0; i < rbio->nr_pages; i++) { + if (!rbio->bio_pages[i]) + continue; + + s = kmap(rbio->bio_pages[i]); + d = kmap(rbio->stripe_pages[i]); + + memcpy(d, s, PAGE_CACHE_SIZE); + + kunmap(rbio->bio_pages[i]); + kunmap(rbio->stripe_pages[i]); + SetPageUptodate(rbio->stripe_pages[i]); + } + set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); +} + +/* + * we hash on the first logical address of the stripe + */ +static int rbio_bucket(struct btrfs_raid_bio *rbio) +{ + u64 num = rbio->raid_map[0]; + + /* + * we shift down quite a bit. We're using byte + * addressing, and most of the lower bits are zeros. + * This tends to upset hash_64, and it consistently + * returns just one or two different values. + * + * shifting off the lower bits fixes things. + */ + return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); +} + +/* + * stealing an rbio means taking all the uptodate pages from the stripe + * array in the source rbio and putting them into the destination rbio + */ +static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) +{ + int i; + struct page *s; + struct page *d; + + if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) + return; + + for (i = 0; i < dest->nr_pages; i++) { + s = src->stripe_pages[i]; + if (!s || !PageUptodate(s)) { + continue; + } + + d = dest->stripe_pages[i]; + if (d) + __free_page(d); + + dest->stripe_pages[i] = s; + src->stripe_pages[i] = NULL; + } +} + +/* + * merging means we take the bio_list from the victim and + * splice it into the destination. The victim should + * be discarded afterwards. + * + * must be called with dest->rbio_list_lock held + */ +static void merge_rbio(struct btrfs_raid_bio *dest, + struct btrfs_raid_bio *victim) +{ + bio_list_merge(&dest->bio_list, &victim->bio_list); + dest->bio_list_bytes += victim->bio_list_bytes; + bio_list_init(&victim->bio_list); +} + +/* + * used to prune items that are in the cache. The caller + * must hold the hash table lock. + */ +static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash_table *table; + struct btrfs_stripe_hash *h; + int freeit = 0; + + /* + * check the bit again under the hash table lock. + */ + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + h = table->table + bucket; + + /* hold the lock for the bucket because we may be + * removing it from the hash table + */ + spin_lock(&h->lock); + + /* + * hold the lock for the bio list because we need + * to make sure the bio list is empty + */ + spin_lock(&rbio->bio_list_lock); + + if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { + list_del_init(&rbio->stripe_cache); + table->cache_size -= 1; + freeit = 1; + + /* if the bio list isn't empty, this rbio is + * still involved in an IO. We take it out + * of the cache list, and drop the ref that + * was held for the list. 
+ * + * If the bio_list was empty, we also remove + * the rbio from the hash_table, and drop + * the corresponding ref + */ + if (bio_list_empty(&rbio->bio_list)) { + if (!list_empty(&rbio->hash_list)) { + list_del_init(&rbio->hash_list); + atomic_dec(&rbio->refs); + BUG_ON(!list_empty(&rbio->plug_list)); + } + } + } + + spin_unlock(&rbio->bio_list_lock); + spin_unlock(&h->lock); + + if (freeit) + __free_raid_bio(rbio); +} + +/* + * prune a given rbio from the cache + */ +static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + __remove_rbio_from_cache(rbio); + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove everything in the cache + */ +void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + struct btrfs_raid_bio *rbio; + + table = info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + while (!list_empty(&table->stripe_cache)) { + rbio = list_entry(table->stripe_cache.next, + struct btrfs_raid_bio, + stripe_cache); + __remove_rbio_from_cache(rbio); + } + spin_unlock_irqrestore(&table->cache_lock, flags); +} + +/* + * remove all cached entries and free the hash table + * used by unmount + */ +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) +{ + if (!info->stripe_hash_table) + return; + btrfs_clear_rbio_cache(info); + if (is_vmalloc_addr(info->stripe_hash_table)) + vfree(info->stripe_hash_table); + else + kfree(info->stripe_hash_table); + info->stripe_hash_table = NULL; +} + +/* + * insert an rbio into the stripe cache. It + * must have already been prepared by calling + * cache_rbio_pages + * + * If this rbio was already cached, it gets + * moved to the front of the lru. + * + * If the size of the rbio cache is too big, we + * prune an item. + */ +static void cache_rbio(struct btrfs_raid_bio *rbio) +{ + struct btrfs_stripe_hash_table *table; + unsigned long flags; + + if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) + return; + + table = rbio->fs_info->stripe_hash_table; + + spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&rbio->bio_list_lock); + + /* bump our ref if we were not in the list before */ + if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) + atomic_inc(&rbio->refs); + + if (!list_empty(&rbio->stripe_cache)){ + list_move(&rbio->stripe_cache, &table->stripe_cache); + } else { + list_add(&rbio->stripe_cache, &table->stripe_cache); + table->cache_size += 1; + } + + spin_unlock(&rbio->bio_list_lock); + + if (table->cache_size > RBIO_CACHE_SIZE) { + struct btrfs_raid_bio *found; + + found = list_entry(table->stripe_cache.prev, + struct btrfs_raid_bio, + stripe_cache); + + if (found != rbio) + __remove_rbio_from_cache(found); + } + + spin_unlock_irqrestore(&table->cache_lock, flags); + return; +} + +/* + * helper function to run the xor_blocks api. It is only + * able to do MAX_XOR_BLOCKS at a time, so we need to + * loop through. 
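As a plain user-space illustration of the chunking described above (MAX_XOR_BLOCKS is a small stand-in constant here; the kernel gets the real limit and the optimized routine from the xor_blocks API selected in Kconfig), accumulating many sources into a destination a few at a time gives the same result as one big pass:

#include <assert.h>
#include <stddef.h>
#include <string.h>

#define MAX_XOR_BLOCKS 4                /* stand-in for the xor_blocks limit */
#define BUF_LEN        4096

/* xor one group of sources into dest; the kernel would call xor_blocks() */
static void xor_range(unsigned char *dest, unsigned char **srcs,
                      int cnt, size_t len)
{
        for (int i = 0; i < cnt; i++)
                for (size_t j = 0; j < len; j++)
                        dest[j] ^= srcs[i][j];
}

/* mirrors run_xor(): walk the source list MAX_XOR_BLOCKS at a time */
static void run_xor_sketch(unsigned char *dest, unsigned char **srcs,
                           int src_cnt, size_t len)
{
        int src_off = 0;

        while (src_cnt > 0) {
                int this_round = src_cnt < MAX_XOR_BLOCKS ? src_cnt : MAX_XOR_BLOCKS;

                xor_range(dest, srcs + src_off, this_round, len);
                src_cnt -= this_round;
                src_off += this_round;
        }
}

int main(void)
{
        static unsigned char bufs[10][BUF_LEN];
        unsigned char *srcs[10];
        unsigned char dest[BUF_LEN] = { 0 }, check[BUF_LEN] = { 0 };

        for (int i = 0; i < 10; i++) {
                memset(bufs[i], i + 1, BUF_LEN);
                srcs[i] = bufs[i];
                xor_range(check, &srcs[i], 1, BUF_LEN);  /* single-pass reference */
        }
        run_xor_sketch(dest, srcs, 10, BUF_LEN);
        assert(memcmp(dest, check, BUF_LEN) == 0);
        return 0;
}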
+ */ +static void run_xor(void **pages, int src_cnt, ssize_t len) +{ + int src_off = 0; + int xor_src_cnt = 0; + void *dest = pages[src_cnt]; + + while(src_cnt > 0) { + xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); + xor_blocks(xor_src_cnt, len, dest, pages + src_off); + + src_cnt -= xor_src_cnt; + src_off += xor_src_cnt; + } +} + +/* + * returns true if the bio list inside this rbio + * covers an entire stripe (no rmw required). + * Must be called with the bio list lock held, or + * at a time when you know it is impossible to add + * new bios into the list + */ +static int __rbio_is_full(struct btrfs_raid_bio *rbio) +{ + unsigned long size = rbio->bio_list_bytes; + int ret = 1; + + if (size != rbio->nr_data * rbio->stripe_len) + ret = 0; + + BUG_ON(size > rbio->nr_data * rbio->stripe_len); + return ret; +} + +static int rbio_is_full(struct btrfs_raid_bio *rbio) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&rbio->bio_list_lock, flags); + ret = __rbio_is_full(rbio); + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + return ret; +} + +/* + * returns 1 if it is safe to merge two rbios together. + * The merging is safe if the two rbios correspond to + * the same stripe and if they are both going in the same + * direction (read vs write), and if neither one is + * locked for final IO + * + * The caller is responsible for locking such that + * rmw_locked is safe to test + */ +static int rbio_can_merge(struct btrfs_raid_bio *last, + struct btrfs_raid_bio *cur) +{ + if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || + test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) + return 0; + + /* + * we can't merge with cached rbios, since the + * idea is that when we merge the destination + * rbio is going to run our IO for us. We can + * steal from cached rbio's though, other functions + * handle that. + */ + if (test_bit(RBIO_CACHE_BIT, &last->flags) || + test_bit(RBIO_CACHE_BIT, &cur->flags)) + return 0; + + if (last->raid_map[0] != + cur->raid_map[0]) + return 0; + + /* reads can't merge with writes */ + if (last->read_rebuild != + cur->read_rebuild) { + return 0; + } + + return 1; +} + +/* + * helper to index into the pstripe + */ +static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +{ + index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + return rbio->stripe_pages[index]; +} + +/* + * helper to index into the qstripe, returns null + * if there is no qstripe + */ +static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +{ + if (rbio->nr_data + 1 == rbio->bbio->num_stripes) + return NULL; + + index += ((rbio->nr_data + 1) * rbio->stripe_len) >> + PAGE_CACHE_SHIFT; + return rbio->stripe_pages[index]; +} + +/* + * The first stripe in the table for a logical address + * has the lock. rbios are added in one of three ways: + * + * 1) Nobody has the stripe locked yet. The rbio is given + * the lock and 0 is returned. The caller must start the IO + * themselves. + * + * 2) Someone has the stripe locked, but we're able to merge + * with the lock owner. The rbio is freed and the IO will + * start automatically along with the existing rbio. 1 is returned. + * + * 3) Someone has the stripe locked, but we're not able to merge. + * The rbio is added to the lock owner's plug list, or merged into + * an rbio already on the plug list. When the lock owner unlocks, + * the next rbio on the list is run and the IO is started automatically. 
+ * 1 is returned + * + * If we return 0, the caller still owns the rbio and must continue with + * IO submission. If we return 1, the caller must assume the rbio has + * already been freed. + */ +static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) +{ + int bucket = rbio_bucket(rbio); + struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *pending; + unsigned long flags; + DEFINE_WAIT(wait); + struct btrfs_raid_bio *freeit = NULL; + struct btrfs_raid_bio *cache_drop = NULL; + int ret = 0; + int walk = 0; + + spin_lock_irqsave(&h->lock, flags); + list_for_each_entry(cur, &h->hash_list, hash_list) { + walk++; + if (cur->raid_map[0] == rbio->raid_map[0]) { + spin_lock(&cur->bio_list_lock); + + /* can we steal this cached rbio's pages? */ + if (bio_list_empty(&cur->bio_list) && + list_empty(&cur->plug_list) && + test_bit(RBIO_CACHE_BIT, &cur->flags) && + !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { + list_del_init(&cur->hash_list); + atomic_dec(&cur->refs); + + steal_rbio(cur, rbio); + cache_drop = cur; + spin_unlock(&cur->bio_list_lock); + + goto lockit; + } + + /* can we merge into the lock owner? */ + if (rbio_can_merge(cur, rbio)) { + merge_rbio(cur, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + + + /* + * we couldn't merge with the running + * rbio, see if we can merge with the + * pending ones. We don't have to + * check for rmw_locked because there + * is no way they are inside finish_rmw + * right now + */ + list_for_each_entry(pending, &cur->plug_list, + plug_list) { + if (rbio_can_merge(pending, rbio)) { + merge_rbio(pending, rbio); + spin_unlock(&cur->bio_list_lock); + freeit = rbio; + ret = 1; + goto out; + } + } + + /* no merging, put us on the tail of the plug list, + * our rbio will be started with the currently + * running rbio unlocks + */ + list_add_tail(&rbio->plug_list, &cur->plug_list); + spin_unlock(&cur->bio_list_lock); + ret = 1; + goto out; + } + } +lockit: + atomic_inc(&rbio->refs); + list_add(&rbio->hash_list, &h->hash_list); +out: + spin_unlock_irqrestore(&h->lock, flags); + if (cache_drop) + remove_rbio_from_cache(cache_drop); + if (freeit) + __free_raid_bio(freeit); + return ret; +} + +/* + * called as rmw or parity rebuild is completed. If the plug list has more + * rbios waiting for this stripe, the next one on the list will be started + */ +static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) +{ + int bucket; + struct btrfs_stripe_hash *h; + unsigned long flags; + int keep_cache = 0; + + bucket = rbio_bucket(rbio); + h = rbio->fs_info->stripe_hash_table->table + bucket; + + if (list_empty(&rbio->plug_list)) + cache_rbio(rbio); + + spin_lock_irqsave(&h->lock, flags); + spin_lock(&rbio->bio_list_lock); + + if (!list_empty(&rbio->hash_list)) { + /* + * if we're still cached and there is no other IO + * to perform, just leave this rbio here for others + * to steal from later + */ + if (list_empty(&rbio->plug_list) && + test_bit(RBIO_CACHE_BIT, &rbio->flags)) { + keep_cache = 1; + clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + BUG_ON(!bio_list_empty(&rbio->bio_list)); + goto done; + } + + list_del_init(&rbio->hash_list); + atomic_dec(&rbio->refs); + + /* + * we use the plug list to hold all the rbios + * waiting for the chance to lock this stripe. + * hand the lock over to one of them. 
+ */ + if (!list_empty(&rbio->plug_list)) { + struct btrfs_raid_bio *next; + struct list_head *head = rbio->plug_list.next; + + next = list_entry(head, struct btrfs_raid_bio, + plug_list); + + list_del_init(&rbio->plug_list); + + list_add(&next->hash_list, &h->hash_list); + atomic_inc(&next->refs); + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + + if (next->read_rebuild) + async_read_rebuild(next); + else { + steal_rbio(rbio, next); + async_rmw_stripe(next); + } + + goto done_nolock; + } else if (waitqueue_active(&h->wait)) { + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + wake_up(&h->wait); + goto done_nolock; + } + } +done: + spin_unlock(&rbio->bio_list_lock); + spin_unlock_irqrestore(&h->lock, flags); + +done_nolock: + if (!keep_cache) + remove_rbio_from_cache(rbio); +} + +static void __free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + WARN_ON(atomic_read(&rbio->refs) < 0); + if (!atomic_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + kfree(rbio->raid_map); + kfree(rbio->bbio); + kfree(rbio); +} + +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + unlock_stripe(rbio); + __free_raid_bio(rbio); +} + +/* + * this frees the rbio and runs through all the bios in the + * bio_list and calls end_io on them + */ +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) +{ + struct bio *cur = bio_list_get(&rbio->bio_list); + struct bio *next; + free_raid_bio(rbio); + + while (cur) { + next = cur->bi_next; + cur->bi_next = NULL; + if (uptodate) + set_bit(BIO_UPTODATE, &cur->bi_flags); + bio_endio(cur, err); + cur = next; + } +} + +/* + * end io function used by finish_rmw. When we finally + * get here, we've written a full stripe + */ +static void raid_write_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + err = 0; + + /* OK, we have read all the stripes we need to. */ + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + err = -EIO; + + rbio_orig_end_io(rbio, err, 0); + return; +} + +/* + * the read/modify/write code wants to use the original bio for + * any pages it included, and then use the rbio for everything + * else. This function decides if a given index (stripe number) + * and page number in that stripe fall inside the original bio + * or the rbio. + * + * if you set bio_list_only, you'll get a NULL back for any ranges + * that are outside the bio_list + * + * This doesn't take any refs on anything, you get a bare page pointer + * and the caller must bump refs as required. + * + * You must call index_rbio_pages once before you can trust + * the answers from this function. 
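The write completion path above (raid_write_end_io) relies on a common idiom: every bio completion decrements stripes_pending, and only the caller that drops it to zero finishes the whole rbio. A minimal user-space sketch of that idiom with C11 atomics (the thread setup is illustrative, not how the kernel schedules end_io callbacks):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pending;
static atomic_int errors;

static void finish_whole_operation(void)
{
        /* the kernel analog: rbio_orig_end_io() or the next phase of the rmw */
        printf("all sub-IOs done, errors=%d\n", atomic_load(&errors));
}

static void *sub_io_done(void *arg)
{
        int failed = *(int *)arg;

        if (failed)
                atomic_fetch_add(&errors, 1);

        /* only the completer that takes pending from 1 to 0 finishes up,
         * like atomic_dec_and_test(&bbio->stripes_pending) */
        if (atomic_fetch_sub(&pending, 1) == 1)
                finish_whole_operation();
        return NULL;
}

int main(void)
{
        pthread_t t[3];
        int fail[3] = { 0, 1, 0 };      /* pretend one sub-IO failed */

        atomic_store(&pending, 3);
        for (int i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, sub_io_done, &fail[i]);
        for (int i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        return 0;
}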
+ */ +static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, + int index, int pagenr, int bio_list_only) +{ + int chunk_page; + struct page *p = NULL; + + chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; + + spin_lock_irq(&rbio->bio_list_lock); + p = rbio->bio_pages[chunk_page]; + spin_unlock_irq(&rbio->bio_list_lock); + + if (p || bio_list_only) + return p; + + return rbio->stripe_pages[chunk_page]; +} + +/* + * number of pages we need for the entire stripe across all the + * drives + */ +static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) +{ + unsigned long nr = stripe_len * nr_stripes; + return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +} + +/* + * allocation and initial setup for the btrfs_raid_bio. Not + * this does not allocate any pages for rbio->pages. + */ +static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len) +{ + struct btrfs_raid_bio *rbio; + int nr_data = 0; + int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); + void *p; + + rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, + GFP_NOFS); + if (!rbio) { + kfree(raid_map); + kfree(bbio); + return ERR_PTR(-ENOMEM); + } + + bio_list_init(&rbio->bio_list); + INIT_LIST_HEAD(&rbio->plug_list); + spin_lock_init(&rbio->bio_list_lock); + INIT_LIST_HEAD(&rbio->stripe_cache); + INIT_LIST_HEAD(&rbio->hash_list); + rbio->bbio = bbio; + rbio->raid_map = raid_map; + rbio->fs_info = root->fs_info; + rbio->stripe_len = stripe_len; + rbio->nr_pages = num_pages; + rbio->faila = -1; + rbio->failb = -1; + atomic_set(&rbio->refs, 1); + + /* + * the stripe_pages and bio_pages array point to the extra + * memory we allocated past the end of the rbio + */ + p = rbio + 1; + rbio->stripe_pages = p; + rbio->bio_pages = p + sizeof(struct page *) * num_pages; + + if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) + nr_data = bbio->num_stripes - 2; + else + nr_data = bbio->num_stripes - 1; + + rbio->nr_data = nr_data; + return rbio; +} + +/* allocate pages for all the stripes in the bio, including parity */ +static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + ClearPageUptodate(page); + } + return 0; +} + +/* allocate pages for just the p/q stripes */ +static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) +{ + int i; + struct page *page; + + i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + + for (; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) + continue; + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[i] = page; + } + return 0; +} + +/* + * add a single page from a specific stripe into our list of bios for IO + * this will try to merge into existing bios if possible, and returns + * zero if all went well. 
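alloc_rbio() above uses a single allocation for the control structure plus its two page-pointer arrays, then points stripe_pages and bio_pages into the tail of that allocation. A user-space sketch of the same layout trick (the struct and field names are illustrative stand-ins for struct btrfs_raid_bio):

#include <stdlib.h>

struct fake_rbio {                      /* illustrative stand-in */
        int nr_pages;
        void **stripe_pages;            /* both arrays live inside the */
        void **bio_pages;               /* same allocation as the struct */
};

static struct fake_rbio *alloc_fake_rbio(int num_pages)
{
        struct fake_rbio *rbio;
        void *p;

        /* one allocation: struct + two arrays of num_pages pointers */
        rbio = calloc(1, sizeof(*rbio) + 2 * num_pages * sizeof(void *));
        if (!rbio)
                return NULL;

        rbio->nr_pages = num_pages;
        p = rbio + 1;                   /* first byte past the struct */
        rbio->stripe_pages = p;
        rbio->bio_pages = (void **)p + num_pages;
        return rbio;                    /* a single free() releases everything */
}

int main(void)
{
        struct fake_rbio *rbio = alloc_fake_rbio(16);

        if (!rbio)
                return 1;
        rbio->stripe_pages[0] = rbio;   /* both arrays are usable storage */
        rbio->bio_pages[15] = rbio;
        free(rbio);
        return 0;
}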
+ */ +int rbio_add_io_page(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct page *page, + int stripe_nr, + unsigned long page_index, + unsigned long bio_max_len) +{ + struct bio *last = bio_list->tail; + u64 last_end = 0; + int ret; + struct bio *bio; + struct btrfs_bio_stripe *stripe; + u64 disk_start; + + stripe = &rbio->bbio->stripes[stripe_nr]; + disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); + + /* if the device is missing, just fail this stripe */ + if (!stripe->dev->bdev) + return fail_rbio_index(rbio, stripe_nr); + + /* see if we can add this page onto our existing bio */ + if (last) { + last_end = (u64)last->bi_sector << 9; + last_end += last->bi_size; + + /* + * we can't merge these if they are from different + * devices or if they are not contiguous + */ + if (last_end == disk_start && stripe->dev->bdev && + test_bit(BIO_UPTODATE, &last->bi_flags) && + last->bi_bdev == stripe->dev->bdev) { + ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); + if (ret == PAGE_CACHE_SIZE) + return 0; + } + } + + /* put a new bio on the list */ + bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); + if (!bio) + return -ENOMEM; + + bio->bi_size = 0; + bio->bi_bdev = stripe->dev->bdev; + bio->bi_sector = disk_start >> 9; + set_bit(BIO_UPTODATE, &bio->bi_flags); + + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + bio_list_add(bio_list, bio); + return 0; +} + +/* + * while we're doing the read/modify/write cycle, we could + * have errors in reading pages off the disk. This checks + * for errors and if we're not able to read the page it'll + * trigger parity reconstruction. The rmw will be finished + * after we've reconstructed the failed stripes + */ +static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) +{ + if (rbio->faila >= 0 || rbio->failb >= 0) { + BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); + __raid56_parity_recover(rbio); + } else { + finish_rmw(rbio); + } +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) +{ + int index; + index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); + index += page; + return rbio->stripe_pages[index]; +} + +/* + * helper function to walk our bio list and populate the bio_pages array with + * the result. This seems expensive, but it is faster than constantly + * searching through the bio list as we setup the IO in finish_rmw or stripe + * reconstruction. + * + * This must be called before you trust the answers from page_in_rbio + */ +static void index_rbio_pages(struct btrfs_raid_bio *rbio) +{ + struct bio *bio; + u64 start; + unsigned long stripe_offset; + unsigned long page_index; + struct page *p; + int i; + + spin_lock_irq(&rbio->bio_list_lock); + bio_list_for_each(bio, &rbio->bio_list) { + start = (u64)bio->bi_sector << 9; + stripe_offset = start - rbio->raid_map[0]; + page_index = stripe_offset >> PAGE_CACHE_SHIFT; + + for (i = 0; i < bio->bi_vcnt; i++) { + p = bio->bi_io_vec[i].bv_page; + rbio->bio_pages[page_index + i] = p; + } + } + spin_unlock_irq(&rbio->bio_list_lock); +} + +/* + * this is called from one of two situations. We either + * have a full stripe from the higher layers, or we've read all + * the missing bits off disk. + * + * This will calculate the parity and then send down any + * changed blocks. 
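The two lookups above boil down to simple index arithmetic: rbio_stripe_page() maps a (stripe, page-within-stripe) pair to stripe * (stripe_len >> PAGE_SHIFT) + pagenr, and index_rbio_pages() maps a bio's logical start to (start - raid_map[0]) >> PAGE_SHIFT. A small standalone check of that arithmetic with hypothetical geometry (64K per-device stripes, 4K pages, data stripes laid out consecutively from raid_map[0]):

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        uint64_t stripe_len = 64 * 1024;                        /* hypothetical */
        uint64_t pages_per_stripe = stripe_len >> PAGE_SHIFT;   /* 16 */
        uint64_t raid_map0 = 1ULL << 30;        /* full-stripe logical start */

        /* rbio_stripe_page()-style index: stripe 2, page 5 within it */
        uint64_t idx = 2 * pages_per_stripe + 5;
        assert(idx == 37);

        /* index_rbio_pages()-style mapping: a bio starting 2 stripes plus
         * 3 pages into the full stripe lands at bio_pages[2 * 16 + 3] */
        uint64_t bio_start = raid_map0 + 2 * stripe_len + 3 * PAGE_SIZE;
        uint64_t page_index = (bio_start - raid_map0) >> PAGE_SHIFT;
        assert(page_index == 2 * pages_per_stripe + 3);

        return 0;
}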
+ */ +static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +{ + struct btrfs_bio *bbio = rbio->bbio; + void *pointers[bbio->num_stripes]; + int stripe_len = rbio->stripe_len; + int nr_data = rbio->nr_data; + int stripe; + int pagenr; + int p_stripe = -1; + int q_stripe = -1; + struct bio_list bio_list; + struct bio *bio; + int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; + int ret; + + bio_list_init(&bio_list); + + if (bbio->num_stripes - rbio->nr_data == 1) { + p_stripe = bbio->num_stripes - 1; + } else if (bbio->num_stripes - rbio->nr_data == 2) { + p_stripe = bbio->num_stripes - 2; + q_stripe = bbio->num_stripes - 1; + } else { + BUG(); + } + + /* at this point we either have a full stripe, + * or we've read the full stripe from the drive. + * recalculate the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + atomic_set(&rbio->bbio->error, 0); + + /* + * now that we've set rmw_locked, run through the + * bio list one last time and map the page pointers + * + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + index_rbio_pages(rbio); + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *p; + /* first collect one page from each data stripe */ + for (stripe = 0; stripe < nr_data; stripe++) { + p = page_in_rbio(rbio, stripe, pagenr, 0); + pointers[stripe] = kmap(p); + } + + /* then add the parity stripe */ + p = rbio_pstripe_page(rbio, pagenr); + SetPageUptodate(p); + pointers[stripe++] = kmap(p); + + if (q_stripe != -1) { + + /* + * raid6, add the qstripe and call the + * library function to fill in our p/q + */ + p = rbio_qstripe_page(rbio, pagenr); + SetPageUptodate(p); + pointers[stripe++] = kmap(p); + + raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, + pointers); + } else { + /* raid5 */ + memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + } + + + for (stripe = 0; stripe < bbio->num_stripes; stripe++) + kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + } + + /* + * time to start writing. Make bios for everything from the + * higher layers (the bio_list in our rbio) and our p/q. Ignore + * everything else. 
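In the RAID5 branch above, the parity page starts as a copy of the first data page and the remaining data pages are XORed in, so P ends up as the XOR of the whole page-column. A user-space check of exactly that, using malloc'd buffers in place of mapped pages (nr_data = 3 and the fill pattern are arbitrary examples):

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096
#define NR_DATA   3             /* arbitrary example: 3 data stripes + P */

int main(void)
{
        unsigned char *pointers[NR_DATA + 1];
        unsigned char expected[PAGE_SIZE];

        for (int i = 0; i <= NR_DATA; i++) {
                pointers[i] = malloc(PAGE_SIZE);
                if (!pointers[i])
                        return 1;
                memset(pointers[i], 0x10 + i, PAGE_SIZE); /* fake data, P starts as junk */
        }

        /* the raid5 path of finish_rmw(): P = data0, then xor in data1..n-1 */
        memcpy(pointers[NR_DATA], pointers[0], PAGE_SIZE);
        for (int i = 1; i < NR_DATA; i++)
                for (int j = 0; j < PAGE_SIZE; j++)
                        pointers[NR_DATA][j] ^= pointers[i][j];

        /* P is now the xor of every data page in this page-column */
        for (int j = 0; j < PAGE_SIZE; j++) {
                expected[j] = 0;
                for (int i = 0; i < NR_DATA; i++)
                        expected[j] ^= pointers[i][j];
        }
        assert(memcmp(pointers[NR_DATA], expected, PAGE_SIZE) == 0);

        for (int i = 0; i <= NR_DATA; i++)
                free(pointers[i]);
        return 0;
}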
+ */ + for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + + ret = rbio_add_io_page(rbio, &bio_list, + page, stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); + BUG_ON(atomic_read(&bbio->stripes_pending) == 0); + + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_write_end_io; + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(WRITE, bio); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +/* + * helper to find the stripe number for a given bio. Used to figure out which + * stripe has failed. This expects the bio to correspond to a physical disk, + * so it looks up based on physical sector numbers. + */ +static int find_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + u64 physical = bio->bi_sector; + u64 stripe_start; + int i; + struct btrfs_bio_stripe *stripe; + + physical <<= 9; + + for (i = 0; i < rbio->bbio->num_stripes; i++) { + stripe = &rbio->bbio->stripes[i]; + stripe_start = stripe->physical; + if (physical >= stripe_start && + physical < stripe_start + rbio->stripe_len) { + return i; + } + } + return -1; +} + +/* + * helper to find the stripe number for a given + * bio (before mapping). Used to figure out which stripe has + * failed. This looks up based on logical block numbers. + */ +static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + u64 logical = bio->bi_sector; + u64 stripe_start; + int i; + + logical <<= 9; + + for (i = 0; i < rbio->nr_data; i++) { + stripe_start = rbio->raid_map[i]; + if (logical >= stripe_start && + logical < stripe_start + rbio->stripe_len) { + return i; + } + } + return -1; +} + +/* + * returns -EIO if we had too many failures + */ +static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&rbio->bio_list_lock, flags); + + /* we already know this stripe is bad, move on */ + if (rbio->faila == failed || rbio->failb == failed) + goto out; + + if (rbio->faila == -1) { + /* first failure on this rbio */ + rbio->faila = failed; + atomic_inc(&rbio->bbio->error); + } else if (rbio->failb == -1) { + /* second failure on this rbio */ + rbio->failb = failed; + atomic_inc(&rbio->bbio->error); + } else { + ret = -EIO; + } +out: + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + + return ret; +} + +/* + * helper to fail a stripe based on a physical disk + * bio. + */ +static int fail_bio_stripe(struct btrfs_raid_bio *rbio, + struct bio *bio) +{ + int failed = find_bio_stripe(rbio, bio); + + if (failed < 0) + return -EIO; + + return fail_rbio_index(rbio, failed); +} + +/* + * this sets each page in the bio uptodate. It should only be used on private + * rbio pages, nothing that comes in from the higher layers + */ +static void set_bio_pages_uptodate(struct bio *bio) +{ + int i; + struct page *p; + + for (i = 0; i < bio->bi_vcnt; i++) { + p = bio->bi_io_vec[i].bv_page; + SetPageUptodate(p); + } +} + +/* + * end io for the read phase of the rmw cycle. All the bios here are physical + * stripe bios we've read from the disk so we can recalculate the parity of the + * stripe. 
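find_logical_bio_stripe() above is a linear scan of raid_map: each data stripe owns the half-open range [raid_map[i], raid_map[i] + stripe_len). A standalone sketch of that lookup plus the faila/failb bookkeeping from fail_rbio_index(), where at most two distinct failures can be absorbed (the raid_map values are made up, and the RAID5_P_STRIPE/RAID6_Q_STRIPE markers are omitted for brevity):

#include <assert.h>
#include <stdint.h>

#define NR_DATA    3
#define STRIPE_LEN (64 * 1024)

static const uint64_t raid_map[NR_DATA] = {     /* made-up logical starts */
        1 << 20, (1 << 20) + STRIPE_LEN, (1 << 20) + 2 * STRIPE_LEN
};

static int find_logical_stripe(uint64_t logical)
{
        for (int i = 0; i < NR_DATA; i++)
                if (logical >= raid_map[i] && logical < raid_map[i] + STRIPE_LEN)
                        return i;
        return -1;
}

/* mirrors fail_rbio_index(): remember at most two distinct failed stripes */
static int record_failure(int *faila, int *failb, int failed)
{
        if (*faila == failed || *failb == failed)
                return 0;               /* already known to be bad */
        if (*faila == -1)
                *faila = failed;
        else if (*failb == -1)
                *failb = failed;
        else
                return -1;              /* -EIO: more failures than parity covers */
        return 0;
}

int main(void)
{
        int faila = -1, failb = -1;

        assert(find_logical_stripe((1 << 20) + STRIPE_LEN + 512) == 1);
        assert(record_failure(&faila, &failb, 1) == 0);
        assert(record_failure(&faila, &failb, 2) == 0);
        assert(record_failure(&faila, &failb, 0) == -1);   /* third failure */
        return 0;
}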
+ * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid_rmw_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + err = 0; + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + goto cleanup; + + /* + * this will normally call finish_rmw to start our write + * but if there are any failed stripes we'll reconstruct + * from parity first + */ + validate_rbio_for_rmw(rbio); + return; + +cleanup: + + rbio_orig_end_io(rbio, -EIO, 0); +} + +static void async_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + rbio->work.flags = 0; + rbio->work.func = rmw_work; + + btrfs_queue_worker(&rbio->fs_info->rmw_workers, + &rbio->work); +} + +static void async_read_rebuild(struct btrfs_raid_bio *rbio) +{ + rbio->work.flags = 0; + rbio->work.func = read_rebuild_work; + + btrfs_queue_worker(&rbio->fs_info->rmw_workers, + &rbio->work); +} + +/* + * the stripe must be locked by the caller. It will + * unlock after all the writes are done + */ +static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct btrfs_bio *bbio = rbio->bbio; + struct bio_list bio_list; + int ret; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int pagenr; + int stripe; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + index_rbio_pages(rbio); + + atomic_set(&rbio->bbio->error, 0); + /* + * build a list of bios to read all the missing parts of this + * stripe + */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) { + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + struct page *page; + /* + * we want to find all the pages missing from + * the rbio and read them from the disk. If + * page_in_rbio finds a page in the bio list + * we don't need to read it off the stripe. + */ + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (page) + continue; + + page = rbio_stripe_page(rbio, stripe, pagenr); + /* + * the bio cache may have handed us an uptodate + * page. If so, be happy and use it + */ + if (PageUptodate(page)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, page, + stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * this can happen if others have merged with + * us, it means there is nothing left to read. + * But if there are missing devices it may not be + * safe to do the full stripe write yet. + */ + goto finish; + } + + /* + * the bbio may be freed once we submit the last bio. 
Make sure + * not to touch it after that + */ + atomic_set(&bbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_rmw_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } + /* the actual write will happen once the reads are done */ + return 0; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); + return -EIO; + +finish: + validate_rbio_for_rmw(rbio); + return 0; +} + +/* + * if the upper layers pass in a full stripe, we thank them by only allocating + * enough pages to hold the parity, and sending it all down quickly. + */ +static int full_stripe_write(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = alloc_rbio_parity_pages(rbio); + if (ret) + return ret; + + ret = lock_stripe_add(rbio); + if (ret == 0) + finish_rmw(rbio); + return 0; +} + +/* + * partial stripe writes get handed over to async helpers. + * We're really hoping to merge a few more writes into this + * rbio before calculating new parity + */ +static int partial_stripe_write(struct btrfs_raid_bio *rbio) +{ + int ret; + + ret = lock_stripe_add(rbio); + if (ret == 0) + async_rmw_stripe(rbio); + return 0; +} + +/* + * sometimes while we were reading from the drive to + * recalculate parity, enough new bios come into create + * a full stripe. So we do a check here to see if we can + * go directly to finish_rmw + */ +static int __raid56_parity_write(struct btrfs_raid_bio *rbio) +{ + /* head off into rmw land if we don't have a full stripe */ + if (!rbio_is_full(rbio)) + return partial_stripe_write(rbio); + return full_stripe_write(rbio); +} + +/* + * We use plugging call backs to collect full stripes. + * Any time we get a partial stripe write while plugged + * we collect it into a list. When the unplug comes down, + * we sort the list by logical block number and merge + * everything we can into the same rbios + */ +struct btrfs_plug_cb { + struct blk_plug_cb cb; + struct btrfs_fs_info *info; + struct list_head rbio_list; + struct btrfs_work work; +}; + +/* + * rbios on the plug list are sorted for easier merging. + */ +static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, + plug_list); + struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, + plug_list); + u64 a_sector = ra->bio_list.head->bi_sector; + u64 b_sector = rb->bio_list.head->bi_sector; + + if (a_sector < b_sector) + return -1; + if (a_sector > b_sector) + return 1; + return 0; +} + +static void run_plug(struct btrfs_plug_cb *plug) +{ + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *last = NULL; + + /* + * sort our plug list then try to merge + * everything we can in hopes of creating full + * stripes. 
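run_plug() (whose body follows) leans on a sort-then-sweep structure: order the queued writes by starting sector, then coalesce neighbours that fall in the same full stripe so they can go down as a single rmw. A user-space sketch of that shape using qsort over an array instead of list_sort over a list (full_stripe_of() and the geometry are hypothetical stand-ins for the raid_map[0] comparison in rbio_can_merge()):

#include <stdio.h>
#include <stdlib.h>

#define FULL_STRIPE_SECTORS 256         /* hypothetical full-stripe size in sectors */

static int cmp_sector(const void *a, const void *b)
{
        unsigned long sa = *(const unsigned long *)a;
        unsigned long sb = *(const unsigned long *)b;

        return (sa > sb) - (sa < sb);   /* like plug_cmp(): order by start sector */
}

static unsigned long full_stripe_of(unsigned long sector)
{
        return sector / FULL_STRIPE_SECTORS;
}

int main(void)
{
        unsigned long sectors[] = { 700, 30, 290, 260, 40, 710 };
        int n = sizeof(sectors) / sizeof(sectors[0]);
        int rbios_submitted = 0;

        qsort(sectors, n, sizeof(sectors[0]), cmp_sector);

        /* sweep: writes that land in the same full stripe merge into one rbio */
        for (int i = 0; i < n; i++) {
                if (i == 0 ||
                    full_stripe_of(sectors[i]) != full_stripe_of(sectors[i - 1]))
                        rbios_submitted++;
        }
        printf("%d queued writes -> %d rbios\n", n, rbios_submitted);
        return 0;
}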
+ */ + list_sort(NULL, &plug->rbio_list, plug_cmp); + while (!list_empty(&plug->rbio_list)) { + cur = list_entry(plug->rbio_list.next, + struct btrfs_raid_bio, plug_list); + list_del_init(&cur->plug_list); + + if (rbio_is_full(cur)) { + /* we have a full stripe, send it down */ + full_stripe_write(cur); + continue; + } + if (last) { + if (rbio_can_merge(last, cur)) { + merge_rbio(last, cur); + __free_raid_bio(cur); + continue; + + } + __raid56_parity_write(last); + } + last = cur; + } + if (last) { + __raid56_parity_write(last); + } + kfree(plug); +} + +/* + * if the unplug comes from schedule, we have to push the + * work off to a helper thread + */ +static void unplug_work(struct btrfs_work *work) +{ + struct btrfs_plug_cb *plug; + plug = container_of(work, struct btrfs_plug_cb, work); + run_plug(plug); +} + +static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct btrfs_plug_cb *plug; + plug = container_of(cb, struct btrfs_plug_cb, cb); + + if (from_schedule) { + plug->work.flags = 0; + plug->work.func = unplug_work; + btrfs_queue_worker(&plug->info->rmw_workers, + &plug->work); + return; + } + run_plug(plug); +} + +/* + * our main entry point for writes from the rest of the FS. + */ +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len) +{ + struct btrfs_raid_bio *rbio; + struct btrfs_plug_cb *plug = NULL; + struct blk_plug_cb *cb; + + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + if (IS_ERR(rbio)) { + kfree(raid_map); + kfree(bbio); + return PTR_ERR(rbio); + } + bio_list_add(&rbio->bio_list, bio); + rbio->bio_list_bytes = bio->bi_size; + + /* + * don't plug on full rbios, just get them out the door + * as quickly as we can + */ + if (rbio_is_full(rbio)) + return full_stripe_write(rbio); + + cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, + sizeof(*plug)); + if (cb) { + plug = container_of(cb, struct btrfs_plug_cb, cb); + if (!plug->info) { + plug->info = root->fs_info; + INIT_LIST_HEAD(&plug->rbio_list); + } + list_add_tail(&rbio->plug_list, &plug->rbio_list); + } else { + return __raid56_parity_write(rbio); + } + return 0; +} + +/* + * all parity reconstruction happens here. We've read in everything + * we can find from the drives and this does the heavy lifting of + * sorting the good from the bad. 
+ */ +static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +{ + int pagenr, stripe; + void **pointers; + int faila = -1, failb = -1; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + struct page *page; + int err; + int i; + + pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), + GFP_NOFS); + if (!pointers) { + err = -ENOMEM; + goto cleanup_io; + } + + faila = rbio->faila; + failb = rbio->failb; + + if (rbio->read_rebuild) { + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + } + + index_rbio_pages(rbio); + + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + /* setup our array of pointers with pages + * from each stripe + */ + for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + /* + * if we're rebuilding a read, we have to use + * pages from the bio list + */ + if (rbio->read_rebuild && + (stripe == faila || stripe == failb)) { + page = page_in_rbio(rbio, stripe, pagenr, 0); + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + pointers[stripe] = kmap(page); + } + + /* all raid6 handling here */ + if (rbio->raid_map[rbio->bbio->num_stripes - 1] == + RAID6_Q_STRIPE) { + + /* + * single failure, rebuild from parity raid5 + * style + */ + if (failb < 0) { + if (faila == rbio->nr_data) { + /* + * Just the P stripe has failed, without + * a bad data or Q stripe. + * TODO, we should redo the xor here. + */ + err = -EIO; + goto cleanup; + } + /* + * a single failure in raid6 is rebuilt + * in the pstripe code below + */ + goto pstripe; + } + + /* make sure our ps and qs are in order */ + if (faila > failb) { + int tmp = failb; + failb = faila; + faila = tmp; + } + + /* if the q stripe is failed, do a pstripe reconstruction + * from the xors. + * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want + */ + if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->raid_map[faila] == RAID5_P_STRIPE) { + err = -EIO; + goto cleanup; + } + /* + * otherwise we have one bad data stripe and + * a good P stripe. raid5! + */ + goto pstripe; + } + + if (rbio->raid_map[failb] == RAID5_P_STRIPE) { + raid6_datap_recov(rbio->bbio->num_stripes, + PAGE_SIZE, faila, pointers); + } else { + raid6_2data_recov(rbio->bbio->num_stripes, + PAGE_SIZE, faila, failb, + pointers); + } + } else { + void *p; + + /* rebuild from P stripe here (raid5 or raid6) */ + BUG_ON(failb != -1); +pstripe: + /* Copy parity block into failed block to start with */ + memcpy(pointers[faila], + pointers[rbio->nr_data], + PAGE_CACHE_SIZE); + + /* rearrange the pointer array */ + p = pointers[faila]; + for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) + pointers[stripe] = pointers[stripe + 1]; + pointers[rbio->nr_data - 1] = p; + + /* xor in the rest */ + run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); + } + /* if we're doing this rebuild as part of an rmw, go through + * and set all of our private rbio pages in the + * failed stripes as uptodate. This way finish_rmw will + * know they can be trusted. 
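The pstripe path above is the whole of RAID5-style recovery: copy the parity block over the missing data block, then XOR the surviving data blocks back in. A user-space round trip with fake page buffers (3 data stripes plus P, losing data stripe 1) showing the lost contents come back bit for bit:

#include <assert.h>
#include <string.h>

#define PAGE_SIZE 4096
#define NR_DATA   3

static unsigned char data[NR_DATA][PAGE_SIZE];  /* fake stripe pages */
static unsigned char parity[PAGE_SIZE];         /* static, so it starts zeroed */
static unsigned char lost_copy[PAGE_SIZE];

int main(void)
{
        int faila = 1;                  /* pretend data stripe 1 failed */

        /* build a stripe: arbitrary data, P accumulates the xor of all blocks */
        for (int i = 0; i < NR_DATA; i++)
                for (int j = 0; j < PAGE_SIZE; j++) {
                        data[i][j] = (unsigned char)(i * 37 + j);
                        parity[j] ^= data[i][j];
                }
        memcpy(lost_copy, data[faila], PAGE_SIZE);
        memset(data[faila], 0, PAGE_SIZE);          /* "lose" the block */

        /* the pstripe rebuild: start from parity, xor in the survivors */
        memcpy(data[faila], parity, PAGE_SIZE);
        for (int i = 0; i < NR_DATA; i++) {
                if (i == faila)
                        continue;
                for (int j = 0; j < PAGE_SIZE; j++)
                        data[faila][j] ^= data[i][j];
        }

        assert(memcmp(data[faila], lost_copy, PAGE_SIZE) == 0);
        return 0;
}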
If this was a read reconstruction, + * other endio functions will fiddle the uptodate bits + */ + if (!rbio->read_rebuild) { + for (i = 0; i < nr_pages; i++) { + if (faila != -1) { + page = rbio_stripe_page(rbio, faila, i); + SetPageUptodate(page); + } + if (failb != -1) { + page = rbio_stripe_page(rbio, failb, i); + SetPageUptodate(page); + } + } + } + for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + /* + * if we're rebuilding a read, we have to use + * pages from the bio list + */ + if (rbio->read_rebuild && + (stripe == faila || stripe == failb)) { + page = page_in_rbio(rbio, stripe, pagenr, 0); + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + kunmap(page); + } + } + + err = 0; +cleanup: + kfree(pointers); + +cleanup_io: + + if (rbio->read_rebuild) { + if (err == 0) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + rbio_orig_end_io(rbio, err, err == 0); + } else if (err == 0) { + rbio->faila = -1; + rbio->failb = -1; + finish_rmw(rbio); + } else { + rbio_orig_end_io(rbio, err, 0); + } +} + +/* + * This is called only for stripes we've read from disk to + * reconstruct the parity. + */ +static void raid_recover_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + /* + * we only read stripe pages off the disk, set them + * up to date if there were no errors + */ + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + return; + + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + rbio_orig_end_io(rbio, -EIO, 0); + else + __raid_recover_end_io(rbio); +} + +/* + * reads everything we need off the disk to reconstruct + * the parity. endio handlers trigger final reconstruction + * when the IO is done. + * + * This is used both for reads from the higher layers and for + * parity construction required to finish a rmw cycle. + */ +static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct btrfs_bio *bbio = rbio->bbio; + struct bio_list bio_list; + int ret; + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int pagenr; + int stripe; + struct bio *bio; + + bio_list_init(&bio_list); + + ret = alloc_rbio_pages(rbio); + if (ret) + goto cleanup; + + atomic_set(&rbio->bbio->error, 0); + + /* + * read everything that hasn't failed. Thanks to the + * stripe cache, it is possible that some or all of these + * pages are going to be uptodate. + */ + for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + if (rbio->faila == stripe || + rbio->failb == stripe) + continue; + + for (pagenr = 0; pagenr < nr_pages; pagenr++) { + struct page *p; + + /* + * the rmw code may have already read this + * page in + */ + p = rbio_stripe_page(rbio, stripe, pagenr); + if (PageUptodate(p)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, + rbio_stripe_page(rbio, stripe, pagenr), + stripe, pagenr, rbio->stripe_len); + if (ret < 0) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * we might have no bios to read just because the pages + * were up to date, or we might have no bios to read because + * the devices were gone. + */ + if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { + __raid_recover_end_io(rbio); + goto out; + } else { + goto cleanup; + } + } + + /* + * the bbio may be freed once we submit the last bio. 
Make sure + * not to touch it after that + */ + atomic_set(&bbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_recover_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } +out: + return 0; + +cleanup: + if (rbio->read_rebuild) + rbio_orig_end_io(rbio, -EIO, 0); + return -EIO; +} + +/* + * the main entry point for reads from the higher layers. This + * is really only called when the normal read path had a failure, + * so we assume the bio they send down corresponds to a failed part + * of the drive. + */ +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, int mirror_num) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + if (IS_ERR(rbio)) { + return PTR_ERR(rbio); + } + + rbio->read_rebuild = 1; + bio_list_add(&rbio->bio_list, bio); + rbio->bio_list_bytes = bio->bi_size; + + rbio->faila = find_logical_bio_stripe(rbio, bio); + if (rbio->faila == -1) { + BUG(); + kfree(rbio); + return -EIO; + } + + /* + * reconstruct from the q stripe if they are + * asking for mirror 3 + */ + if (mirror_num == 3) + rbio->failb = bbio->num_stripes - 2; + + ret = lock_stripe_add(rbio); + + /* + * __raid56_parity_recover will end the bio with + * any errors it hits. We don't want to return + * its error value up the stack because our caller + * will end up calling bio_endio with any nonzero + * return + */ + if (ret == 0) + __raid56_parity_recover(rbio); + /* + * our rbio has been added to the list of + * rbios that will be handled after the + * currently lock owner is done + */ + return 0; + +} + +static void rmw_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + raid56_rmw_stripe(rbio); +} + +static void read_rebuild_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + __raid56_parity_recover(rbio); +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h new file mode 100644 index 000000000000..ea5d73bfdfbe --- /dev/null +++ b/fs/btrfs/raid56.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2012 Fusion-io All rights reserved. + * Copyright (C) 2012 Intel Corp. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_RAID56__ +#define __BTRFS_RAID56__ +static inline int nr_parity_stripes(struct map_lookup *map) +{ + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 1; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return 2; + else + return 0; +} + +static inline int nr_data_stripes(struct map_lookup *map) +{ + return map->num_stripes - nr_parity_stripes(map); +} +#define RAID5_P_STRIPE ((u64)-2) +#define RAID6_Q_STRIPE ((u64)-1) + +#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ + ((x) == RAID6_Q_STRIPE)) + +int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len, int mirror_num); +int raid56_parity_write(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 *raid_map, + u64 stripe_len); + +int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); +void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); +#endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 17c306bf177a..50695dc5e2ab 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3017,7 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode, } } - page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 67783e03d121..53c3501fa4ca 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -28,6 +28,7 @@ #include "dev-replace.h" #include "check-integrity.h" #include "rcu-string.h" +#include "raid56.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -2254,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_device *extent_dev; int extent_mirror_num; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + if (num >= nr_data_stripes(map)) { + return 0; + } + } + nstripes = length; offset = 0; do_div(nstripes, map->stripe_len); @@ -2708,7 +2716,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, int ret; struct btrfs_root *root = sctx->dev_root; - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EIO; gen = root->fs_info->last_trans_committed; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f4ab7a9260eb..f7a8b861058b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -85,6 +85,7 @@ struct send_ctx { u32 send_max_size; u64 total_send_size; u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; + u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ struct vfsmount *mnt; @@ -3709,6 +3710,39 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " return ret; } +/* + * Send an update extent command to user space. 
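Stepping back to the raid56.h helpers introduced above: their typical use is sizing a full stripe, since the usable width is the per-device stripe length times the number of data stripes. A sketch under that assumption (map_lookup fields as used elsewhere in this series; the same product shows up later in btrfs_full_stripe_len()):

	static inline u64 full_stripe_data_bytes(struct map_lookup *map)
	{
		/* e.g. 4 devices in RAID6 with 64K stripe_len: 2 data stripes, 128K */
		return (u64)map->stripe_len * nr_data_stripes(map);
	}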
+ */ +static int send_update_extent(struct send_ctx *sctx, + u64 offset, u32 len) +{ + int ret = 0; + struct fs_path *p; + + p = fs_path_alloc(sctx); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); + if (ret < 0) + goto out; + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(sctx, p); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -3744,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx, goto out; } - if (!clone_root) { + if (clone_root) { + ret = send_clone(sctx, offset, len, clone_root); + } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { + ret = send_update_extent(sctx, offset, len); + } else { while (pos < len) { l = len - pos; if (l > BTRFS_SEND_READ_SIZE) @@ -3757,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx, pos += ret; } ret = 0; - } else { - ret = send_clone(sctx, offset, len, clone_root); } - out: return ret; } @@ -4536,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) struct btrfs_fs_info *fs_info; struct btrfs_ioctl_send_args *arg = NULL; struct btrfs_key key; - struct file *filp = NULL; struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; @@ -4561,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) { + ret = -EINVAL; + goto out; + } + sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); if (!sctx) { ret = -ENOMEM; @@ -4572,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); INIT_LIST_HEAD(&sctx->name_cache_list); + sctx->flags = arg->flags; + sctx->send_filp = fget(arg->send_fd); if (IS_ERR(sctx->send_filp)) { ret = PTR_ERR(sctx->send_filp); @@ -4673,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; out: - if (filp) - fput(filp); kfree(arg); vfree(clone_sources_tmp); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 1bf4f32fd4ef..8bb18f7ccaa6 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -86,6 +86,7 @@ enum btrfs_send_cmd { BTRFS_SEND_C_UTIMES, BTRFS_SEND_C_END, + BTRFS_SEND_C_UPDATE_EXTENT, __BTRFS_SEND_C_MAX, }; #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d8982e9601d3..68a29a1ea068 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -41,13 +41,13 @@ #include #include #include +#include #include "compat.h" #include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "ioctl.h" #include "print-tree.h" #include "xattr.h" #include "volumes.h" @@ -63,8 +63,7 @@ static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; -static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, - char nbuf[16]) +static const char *btrfs_decode_error(int errno, char nbuf[16]) { char *errstr = NULL; @@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info) * today we only save the error info into ram. 
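From user space the new send flag would be driven roughly as below (a sketch; it assumes the send ABI is exported as BTRFS_IOC_SEND / BTRFS_SEND_FLAG_NO_FILE_DATA in linux/btrfs.h). With the flag set, file contents are described by UPDATE_EXTENT commands instead of WRITE commands:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/btrfs.h>	/* assumed home of the send ioctl ABI */

	static int send_metadata_only(int subvol_fd, int out_fd)
	{
		struct btrfs_ioctl_send_args args;

		memset(&args, 0, sizeof(args));
		args.send_fd = out_fd;
		args.flags = BTRFS_SEND_FLAG_NO_FILE_DATA;

		return ioctl(subvol_fd, BTRFS_IOC_SEND, &args);
	}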
Long term we'll * also send it down to the disk */ - fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); } static void save_error_info(struct btrfs_fs_info *fs_info) @@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (sb->s_flags & MS_RDONLY) return; - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); /* @@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, struct super_block *sb = fs_info->sb; char nbuf[16]; const char *errstr; - va_list args; - va_start(args, fmt); /* * Special case: if the error is EROFS, and we're already @@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) return; - errstr = btrfs_decode_error(fs_info, errno, nbuf); + errstr = btrfs_decode_error(errno, nbuf); if (fmt) { - struct va_format vaf = { - .fmt = fmt, - .va = &args, - }; + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", sb->s_id, function, line, errstr, &vaf); + va_end(args); } else { printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); @@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, save_error_info(fs_info); btrfs_handle_error(fs_info); } - va_end(args); } static const char * const logtypes[] = { @@ -261,7 +260,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, char nbuf[16]; const char *errstr; - errstr = btrfs_decode_error(root->fs_info, errno, nbuf); + errstr = btrfs_decode_error(errno, nbuf); btrfs_printk(root->fs_info, "%s:%d: Aborting unused transaction(%s).\n", function, line, errstr); @@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, va_start(args, fmt); vaf.va = &args; - errstr = btrfs_decode_error(fs_info, errno, nbuf); - if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) + errstr = btrfs_decode_error(errno, nbuf); + if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", s_id, function, line, &vaf, errstr); @@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_compress_force: case Opt_compress_force_type: compress_force = true; + /* Fallthrough */ case Opt_compress: case Opt_compress_type: if (token == Opt_compress || @@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_alloc_start: num = match_strdup(&args[0]); if (num) { + mutex_lock(&info->chunk_mutex); info->alloc_start = memparse(num, NULL); + mutex_unlock(&info->chunk_mutex); kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", @@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0); - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ if (PTR_ERR(trans) == -ENOENT) @@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ + 
set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (flags & MS_RDONLY))) { + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + if (flags & MS_RDONLY) + sync_filesystem(fs_info->sb); + } +} + +static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, + unsigned long old_opts) +{ + /* + * We need cleanup all defragable inodes if the autodefragment is + * close or the fs is R/O. + */ + if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || + (fs_info->sb->s_flags & MS_RDONLY))) { + btrfs_cleanup_defrag_inodes(fs_info); + } + + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); +} + static int btrfs_remount(struct super_block *sb, int *flags, char *data) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; + btrfs_remount_prepare(fs_info, old_opts, *flags); + ret = btrfs_parse_options(root, data); if (ret) { ret = -EINVAL; @@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) fs_info->thread_pool_size, old_thread_pool_size); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) - return 0; + goto out; if (*flags & MS_RDONLY) { /* @@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } sb->s_flags &= ~MS_RDONLY; } - +out: + btrfs_remount_cleanup(fs_info, old_opts); return 0; restore: @@ -1289,10 +1326,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; fs_info->max_inline = old_max_inline; + mutex_lock(&fs_info->chunk_mutex); fs_info->alloc_start = old_alloc_start; + mutex_unlock(&fs_info->chunk_mutex); btrfs_resize_thread_pool(fs_info, old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; + btrfs_remount_cleanup(fs_info, old_opts); return ret; } @@ -1559,7 +1599,7 @@ static int btrfs_freeze(struct super_block *sb) struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_sb(sb)->tree_root; - trans = btrfs_attach_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ if (PTR_ERR(trans) == -ENOENT) @@ -1684,10 +1724,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_delayed_inode; - err = btrfs_interface_init(); + err = btrfs_delayed_ref_init(); if (err) goto free_auto_defrag; + err = btrfs_interface_init(); + if (err) + goto free_delayed_ref; + err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; @@ -1699,6 +1743,8 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); +free_delayed_ref: + btrfs_delayed_ref_exit(); free_auto_defrag: btrfs_auto_defrag_exit(); free_delayed_inode: @@ -1720,6 +1766,7 @@ static int __init init_btrfs_fs(void) static void __exit exit_btrfs_fs(void) { btrfs_destroy_cachep(); + btrfs_delayed_ref_exit(); btrfs_auto_defrag_exit(); btrfs_delayed_inode_exit(); ordered_data_exit(); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index daac9ae6d731..5b326cd60a4a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include 
"ctree.h" diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4c0067c4f76d..e52da6fb1165 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction) if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(transaction->delayed_refs.root.rb_node); - memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } } @@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) root->commit_root = btrfs_root_node(root); } +static inline int can_join_transaction(struct btrfs_transaction *trans, + int type) +{ + return !(trans->in_commit && + type != TRANS_JOIN && + type != TRANS_JOIN_NOLOCK); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type) spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); return -EROFS; } @@ -86,6 +93,10 @@ static noinline int join_transaction(struct btrfs_root *root, int type) spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } + if (!can_join_transaction(cur_trans, type)) { + spin_unlock(&fs_info->trans_lock); + return -EBUSY; + } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; @@ -113,7 +124,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type) */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); goto loop; - } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { spin_unlock(&fs_info->trans_lock); kmem_cache_free(btrfs_transaction_cachep, cur_trans); return -EROFS; @@ -155,8 +166,12 @@ static noinline int join_transaction(struct btrfs_root *root, int type) spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); + atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); + atomic_set(&cur_trans->delayed_refs.ref_seq, 0); + init_waitqueue_head(&cur_trans->delayed_refs.wait); INIT_LIST_HEAD(&cur_trans->pending_snapshots); + INIT_LIST_HEAD(&cur_trans->ordered_operations); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -301,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, int ret; u64 qgroup_reserved = 0; - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); if (current->journal_info) { @@ -359,8 +374,11 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, do { ret = join_transaction(root, type); - if (ret == -EBUSY) + if (ret == -EBUSY) { wait_current_trans(root); + if (unlikely(type == TRANS_ATTACH)) + ret = -ENOENT; + } } while (ret == -EBUSY); if (ret < 0) { @@ -382,9 +400,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, h->block_rsv = NULL; h->orig_rsv = NULL; h->aborted = 0; - h->qgroup_reserved = qgroup_reserved; + h->qgroup_reserved = 0; h->delayed_ref_elem.seq = 0; h->type = type; + h->allocating_chunk = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); @@ -400,6 +419,7 @@ start_transaction(struct 
btrfs_root *root, u64 num_items, int type, h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } + h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); @@ -451,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root return start_transaction(root, 0, TRANS_USERSPACE, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is used when we want to commit the current the transaction, but + * don't want to start a new one. + * + * Note: If this function return -ENOENT, it just means there is no + * running transaction. But it is possible that the inactive transaction + * is still in the memory, not fully on disk. If you hope there is no + * inactive transaction in the fs when -ENOENT is returned, you should + * invoke + * btrfs_attach_transaction_barrier() + */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_ATTACH, 0); } +/* + * btrfs_attach_transaction() - catch the running transaction + * + * It is similar to the above function, the differentia is this one + * will wait for all the inactive transactions until they fully + * complete. + */ +struct btrfs_trans_handle * +btrfs_attach_transaction_barrier(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + + trans = start_transaction(root, 0, TRANS_ATTACH, 0); + if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) + btrfs_wait_for_commit(root, 0); + + return trans; +} + /* wait for a transaction commit to be fully complete */ static noinline void wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) @@ -587,7 +639,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - while (count < 2) { + while (count < 1) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (cur && @@ -599,6 +651,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } count++; } + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -644,12 +697,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_run_delayed_iputs(root); if (trans->aborted || - root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) err = -EIO; - } assert_qgroups_uptodate(trans); - memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); return err; } @@ -696,7 +747,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_state *cached_state = NULL; u64 start = 0; u64 end; + struct blk_plug plug; + blk_start_plug(&plug); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, @@ -710,6 +763,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, } if (err) werr = err; + blk_finish_plug(&plug); return werr; } @@ -960,10 +1014,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, } /* - * defrag a given btree. If cacheonly == 1, this won't read from the disk, - * otherwise every leaf in the btree is read and defragged. + * defrag a given btree. + * Every leaf in the btree is read and defragged. 
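A sketch of how the barrier variant documented above is meant to be used: callers that must not return before an already-committing transaction is fully on disk attach with the barrier, then commit if anything was running (demo_flush() is illustrative, not from the patch):

	static int demo_flush(struct btrfs_root *root)
	{
		struct btrfs_trans_handle *trans;

		trans = btrfs_attach_transaction_barrier(root);
		if (IS_ERR(trans)) {
			/* -ENOENT: nothing running, and the barrier already
			 * waited for the last commit to finish */
			return PTR_ERR(trans) == -ENOENT ? 0 : PTR_ERR(trans);
		}
		return btrfs_commit_transaction(trans, root);
	}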
*/ -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +int btrfs_defrag_root(struct btrfs_root *root) { struct btrfs_fs_info *info = root->fs_info; struct btrfs_trans_handle *trans; @@ -977,7 +1031,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_defrag_leaves(trans, root, cacheonly); + ret = btrfs_defrag_leaves(trans, root); btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(info->tree_root); @@ -985,6 +1039,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) break; + + if (btrfs_defrag_cancelled(root->fs_info)) { + printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); + ret = -EAGAIN; + break; + } } root->defrag_running = 0; return ret; @@ -1007,7 +1067,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct inode *parent_inode; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; @@ -1022,7 +1081,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = pending->error = -ENOMEM; - goto path_alloc_fail; + return ret; } new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); @@ -1062,10 +1121,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; + trans->bytes_reserved = trans->block_rsv->reserved; dentry = pending->dentry; - parent = dget_parent(dentry); - parent_inode = parent->d_inode; + parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); @@ -1213,14 +1272,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, root, ret); fail: - dput(parent); trans->block_rsv = rsv; + trans->bytes_reserved = 0; no_free_objectid: kfree(new_root_item); root_item_alloc_fail: btrfs_free_path(path); -path_alloc_fail: - btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); return ret; } @@ -1306,13 +1363,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, struct btrfs_async_commit { struct btrfs_trans_handle *newtrans; struct btrfs_root *root; - struct delayed_work work; + struct work_struct work; }; static void do_async_commit(struct work_struct *work) { struct btrfs_async_commit *ac = - container_of(work, struct btrfs_async_commit, work.work); + container_of(work, struct btrfs_async_commit, work); /* * We've got freeze protection passed with the transaction. 
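The async-commit conversion just above drops a delayed_work that was only ever scheduled with zero delay; a plain work_struct does the same job with less bookkeeping. The pattern in isolation (demo_* names are placeholders):

	struct demo_commit {
		struct work_struct work;
		/* per-commit state */
	};

	static void demo_commit_fn(struct work_struct *work)
	{
		struct demo_commit *dc = container_of(work, struct demo_commit, work);

		pr_debug("running deferred commit %p\n", dc);
	}

	static void demo_commit_kick(struct demo_commit *dc)
	{
		INIT_WORK(&dc->work, demo_commit_fn);
		schedule_work(&dc->work);	/* was schedule_delayed_work(..., 0) */
	}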
@@ -1340,7 +1397,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, if (!ac) return -ENOMEM; - INIT_DELAYED_WORK(&ac->work, do_async_commit); + INIT_WORK(&ac->work, do_async_commit); ac->root = root; ac->newtrans = btrfs_join_transaction(root); if (IS_ERR(ac->newtrans)) { @@ -1364,7 +1421,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1, _THIS_IP_); - schedule_delayed_work(&ac->work, 0); + schedule_work(&ac->work); /* wait for transaction to start and unblock */ if (wait_for_unblock) @@ -1384,6 +1441,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, int err) { struct btrfs_transaction *cur_trans = trans->transaction; + DEFINE_WAIT(wait); WARN_ON(trans->use_count > 1); @@ -1392,8 +1450,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + spin_lock(&root->fs_info->trans_lock); root->fs_info->running_transaction = NULL; - root->fs_info->trans_no_join = 0; } spin_unlock(&root->fs_info->trans_lock); @@ -1427,7 +1490,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, } if (flush_on_commit || snap_pending) { - btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_start_delalloc_inodes(root, 1); + if (ret) + return ret; btrfs_wait_ordered_extents(root, 1); } @@ -1449,9 +1514,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, * it here and no for sure that nothing new will be added * to the list */ - btrfs_run_ordered_operations(root, 1); + ret = btrfs_run_ordered_operations(trans, root, 1); - return 0; + return ret; } /* @@ -1472,27 +1537,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int should_grow = 0; unsigned long now = get_seconds(); - ret = btrfs_run_ordered_operations(root, 0); + ret = btrfs_run_ordered_operations(trans, root, 0); if (ret) { btrfs_abort_transaction(trans, root, ret); - goto cleanup_transaction; + btrfs_end_transaction(trans, root); + return ret; } /* Stop the commit early if ->aborted is set */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; - goto cleanup_transaction; + btrfs_end_transaction(trans, root); + return ret; } /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ ret = btrfs_run_delayed_refs(trans, root, 0); - if (ret) - goto cleanup_transaction; + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } cur_trans = trans->transaction; @@ -1506,8 +1579,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_create_pending_block_groups(trans, root); ret = btrfs_run_delayed_refs(trans, root, 0); - if (ret) - goto cleanup_transaction; + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } spin_lock(&cur_trans->commit_lock); if (cur_trans->in_commit) { @@ -1771,6 +1846,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cleanup_transaction: btrfs_trans_release_metadata(trans, root); trans->block_rsv = 
NULL; + if (trans->qgroup_reserved) { + btrfs_qgroup_free(root, trans->qgroup_reserved); + trans->qgroup_reserved = 0; + } btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); // WARN_ON(1); if (current->journal_info == trans) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0e8aa1e6c287..3c8e0d25c8e4 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -43,6 +43,7 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; + struct list_head ordered_operations; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; @@ -68,6 +69,7 @@ struct btrfs_trans_handle { struct btrfs_block_rsv *orig_rsv; short aborted; short adding_csums; + bool allocating_chunk; enum btrfs_trans_type type; /* * this root is only needed to validate that the root passed to @@ -82,11 +84,13 @@ struct btrfs_trans_handle { struct btrfs_pending_snapshot { struct dentry *dentry; + struct inode *dir; struct btrfs_root *root; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; + u64 qgroup_reserved; /* extra metadata reseration for relocation */ int error; bool readonly; @@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_attach_transaction_barrier( + struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); +int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3b580ee8ab1d..94e05c1f118a 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -23,13 +23,14 @@ #include "transaction.h" #include "locking.h" -/* defrag all the leaves in a given btree. If cache_only == 1, don't read - * things from disk, otherwise read all the leaves and try to get key order to +/* + * Defrag all the leaves in a given btree. 
+ * Read all the leaves and try to get key order to * better reflect disk order */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int cache_only) + struct btrfs_root *root) { struct btrfs_path *path = NULL; struct btrfs_key key; @@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, u64 last_ret = 0; u64 min_trans = 0; - if (cache_only) - goto out; - if (root->fs_info->extent_root == root) { /* * there's recursion here right now in the tree locking, @@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, } path->keep_locks = 1; - if (cache_only) - min_trans = root->defrag_trans_start; - ret = btrfs_search_forward(root, &key, NULL, path, - cache_only, min_trans); + ret = btrfs_search_forward(root, &key, NULL, path, min_trans); if (ret < 0) goto out; if (ret > 0) { @@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + next_key_ret = btrfs_find_next_key(root, path, &key, 1, min_trans); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, - cache_only, &last_ret, + &last_ret, &root->defrag_progress); if (ret) { WARN_ON(ret == -EAGAIN); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9027bb1e7466..c7ef569eb22a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_pin_extent_for_log_replay(wc->trans, - log->fs_info->extent_root, + btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, eb->start, eb->len); if (btrfs_buffer_uptodate(eb, gen, 0)) { @@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_key *key) { int found_type; - u64 mask = root->sectorsize - 1; u64 extent_end; u64 start = key->offset; u64 saved_nbytes; @@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = start + btrfs_file_extent_num_bytes(eb, item); else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size = btrfs_file_extent_inline_len(eb, item); - extent_end = (start + size + mask) & ~mask; + extent_end = ALIGN(start + size, root->sectorsize); } else { ret = 0; goto out; @@ -2281,6 +2279,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, unsigned long log_transid = 0; mutex_lock(&root->log_mutex); + log_transid = root->log_transid; index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { wait_log_commit(trans, root, root->log_transid); @@ -2308,11 +2307,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* bail out if we need to do a full commit */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { ret = -EAGAIN; + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } - log_transid = root->log_transid; if (log_transid % 2 == 0) mark = EXTENT_DIRTY; else @@ -2324,6 +2323,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { btrfs_abort_transaction(trans, root, ret); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); goto out; } @@ -2363,6 +2363,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } root->fs_info->last_trans_log_full_commit = trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + 
btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out; @@ -2373,6 +2374,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = 0; goto out; @@ -2392,6 +2394,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -2402,10 +2405,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); if (ret) { btrfs_abort_transaction(trans, root, ret); + btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_wait_logged_extents(log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); @@ -2461,8 +2466,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans, .process_func = process_one_buffer }; - ret = walk_log_tree(trans, log, &wc); - BUG_ON(ret); + if (trans) { + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + } while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -2475,6 +2482,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } + /* + * We may have short-circuited the log tree with the full commit logic + * and left ordered extents on our list, so clear these out to keep us + * from leaking inodes and memory. + */ + btrfs_free_logged_extents(log, 0); + btrfs_free_logged_extents(log, 1); + free_extent_buffer(log->node); kfree(log); } @@ -2724,7 +2739,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, path->keep_locks = 1; ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, trans->transid); + path, trans->transid); /* * we didn't find anything from this transaction, see if there @@ -3271,16 +3286,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; + struct btrfs_ordered_extent *ordered; struct list_head ordered_sums; struct btrfs_map_token token; struct btrfs_key key; - u64 csum_offset = em->mod_start - em->start; - u64 csum_len = em->mod_len; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; + u64 csum_offset; + u64 csum_len; u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; + int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; +insert: INIT_LIST_HEAD(&ordered_sums); btrfs_init_map_token(&token); key.objectid = btrfs_ino(inode); @@ -3296,6 +3316,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + /* + * If we are overwriting an inline extent with a real one then we need + * to just delete the inline extent as it may not be large enough to + * have the entire file_extent_item. 
+ */ + if (ret && btrfs_token_file_extent_type(leaf, fi, &token) == + BTRFS_FILE_EXTENT_INLINE) { + ret = btrfs_del_item(trans, log, path); + btrfs_release_path(path); + if (ret) { + path->really_keep_locks = 0; + return ret; + } + goto insert; + } + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { @@ -3362,6 +3399,92 @@ static int log_one_extent(struct btrfs_trans_handle *trans, csum_len = block_len; } + /* + * First check and see if our csums are on our outstanding ordered + * extents. + */ +again: + spin_lock_irq(&log->log_extents_lock[index]); + list_for_each_entry(ordered, &log->logged_list[index], log_list) { + struct btrfs_ordered_sum *sum; + + if (!mod_len) + break; + + if (ordered->inode != inode) + continue; + + if (ordered->file_offset + ordered->len <= mod_start || + mod_start + mod_len <= ordered->file_offset) + continue; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this + * ordered extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered->file_offset + ordered->len >= + mod_start + mod_len) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. + */ + } else { + if (ordered->file_offset + ordered->len < + mod_start + mod_len) { + mod_len = (mod_start + mod_len) - + (ordered->file_offset + ordered->len); + mod_start = ordered->file_offset + + ordered->len; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, + &ordered->flags)) + continue; + atomic_inc(&ordered->refs); + spin_unlock_irq(&log->log_extents_lock[index]); + /* + * we've dropped the lock, we must either break or + * start over after this. + */ + + wait_event(ordered->wait, ordered->csum_bytes_left == 0); + + list_for_each_entry(sum, &ordered->list, list) { + ret = btrfs_csum_file_blocks(trans, log, sum); + if (ret) { + btrfs_put_ordered_extent(ordered); + goto unlocked; + } + } + btrfs_put_ordered_extent(ordered); + goto again; + + } + spin_unlock_irq(&log->log_extents_lock[index]); +unlocked: + + if (!mod_len || ret) + return ret; + + csum_offset = mod_start - em->start; + csum_len = mod_len; + /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, em->block_start + csum_offset, @@ -3393,6 +3516,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; u64 test_gen; int ret = 0; + int num = 0; INIT_LIST_HEAD(&extents); @@ -3401,16 +3525,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, list_for_each_entry_safe(em, n, &tree->modified_extents, list) { list_del_init(&em->list); + + /* + * Just an arbitrary number, this can be really CPU intensive + * once we start getting a lot of extents, and really once we + * have a bunch of extents we just want to commit since it will + * be faster. 
+ */ + if (++num > 32768) { + list_del_init(&tree->modified_extents); + ret = -EFBIG; + goto process; + } + if (em->generation <= test_gen) continue; /* Need a ref to keep it from getting evicted from cache */ atomic_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); list_add_tail(&em->list, &extents); + num++; } list_sort(NULL, &extents, extent_cmp); +process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -3513,6 +3652,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); + btrfs_get_logged_extents(log, inode); + /* * a brute force approach to making sure we get the most uptodate * copies of everything. @@ -3558,7 +3699,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, while (1) { ins_nr = 0; ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, trans->transid); + path, trans->transid); if (ret != 0) break; again: @@ -3656,6 +3797,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: + if (err) + btrfs_free_logged_extents(log, log->log_transid); mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); @@ -3822,7 +3965,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: dput(old_parent); if (ret < 0) { - WARN_ON(ret != -ENOSPC); root->fs_info->last_trans_log_full_commit = trans->transid; ret = 1; } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 99be4c138db6..ddc61cad0080 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,7 +5,7 @@ */ #include -#include +#include #include "ulist.h" /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cbb7f4b1672..35bb2d4ed29f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "compat.h" #include "ctree.h" #include "extent_map.h" @@ -32,6 +34,7 @@ #include "transaction.h" #include "print-tree.h" #include "volumes.h" +#include "raid56.h" #include "async-thread.h" #include "check-integrity.h" #include "rcu-string.h" @@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) new_device->writeable = 0; new_device->in_fs_metadata = 0; new_device->can_discard = 0; + spin_lock_init(&new_device->io_lock); list_replace_rcu(&device->dev_list, &new_device->dev_list); call_rcu(&device->rcu, free_device); @@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } +/* + * Look for a btrfs signature on a device. This may be called out of the mount path + * and we are not allowed to call set_blocksize during the scan. The superblock + * is read via pagecache + */ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret) { struct btrfs_super_block *disk_super; struct block_device *bdev; - struct buffer_head *bh; - int ret; + struct page *page; + void *p; + int ret = -EINVAL; u64 devid; u64 transid; u64 total_devices; + u64 bytenr; + pgoff_t index; + /* + * we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. 
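A worked example of the page-cache arithmetic in the scan code that follows, assuming 4K pages: the primary super block sits at btrfs_sb_offset(0) = 64KiB, i.e. offset 0 of page index 16, so it can never straddle a page boundary (demo function for illustration only):

	static void demo_super_page_math(void)
	{
		u64 bytenr = 64 * 1024;				/* btrfs_sb_offset(0) */
		pgoff_t index = bytenr >> PAGE_CACHE_SHIFT;	/* 65536 / 4096 = 16 */
		unsigned int off = bytenr & ~PAGE_CACHE_MASK;	/* 0, start of that page */

		printk(KERN_DEBUG "super: page %lu offset %u\n", index, off);
	}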
+ * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + bytenr = btrfs_sb_offset(0); flags |= FMODE_EXCL; mutex_lock(&uuid_mutex); - ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); - if (ret) + + bdev = blkdev_get_by_path(path, flags, holder); + + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); goto error; - disk_super = (struct btrfs_super_block *)bh->b_data; + } + + /* make sure our super fits in the device */ + if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) + goto error_bdev_put; + + /* make sure our super fits in the page */ + if (sizeof(*disk_super) > PAGE_CACHE_SIZE) + goto error_bdev_put; + + /* make sure our super doesn't straddle pages on disk */ + index = bytenr >> PAGE_CACHE_SHIFT; + if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) + goto error_bdev_put; + + /* pull in the page with our super */ + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + index, GFP_NOFS); + + if (IS_ERR_OR_NULL(page)) + goto error_bdev_put; + + p = kmap(page); + + /* align our pointer to the offset of the super block */ + disk_super = p + (bytenr & ~PAGE_CACHE_MASK); + + if (btrfs_super_bytenr(disk_super) != bytenr || + disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) + goto error_unmap; + devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); + if (disk_super->label[0]) { if (disk_super->label[BTRFS_LABEL_SIZE - 1]) disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; @@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, } else { printk(KERN_INFO "device fsid %pU ", disk_super->fsid); } + printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; - brelse(bh); + +error_unmap: + kunmap(page); + page_cache_release(page); + +error_bdev_put: blkdev_put(bdev, flags); error: mutex_unlock(&uuid_mutex); @@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) u64 devid; u64 num_devices; u8 *dev_uuid; + unsigned seq; int ret = 0; bool clear_super = false; mutex_lock(&uuid_mutex); - all_avail = root->fs_info->avail_data_alloc_bits | - root->fs_info->avail_system_alloc_bits | - root->fs_info->avail_metadata_alloc_bits; + do { + seq = read_seqbegin(&root->fs_info->profiles_lock); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&root->fs_info->profiles_lock, seq)); num_devices = root->fs_info->fs_devices->num_devices; btrfs_dev_replace_lock(&root->fs_info->dev_replace); @@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } + if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && + root->fs_info->fs_devices->rw_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid5\n"); + ret = -EINVAL; + goto out; + } + if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && + root->fs_info->fs_devices->rw_devices <= 3) { + printk(KERN_ERR "btrfs: unable to go below three " + "devices on raid6\n"); + ret = -EINVAL; + goto out; + } + if (strcmp(device_path, "missing") == 0) { struct list_head *devices; struct btrfs_device *tmp; @@ -2616,7 +2696,7 @@ static int chunk_usage_filter(struct 
btrfs_fs_info *fs_info, u64 chunk_offset, chunk_used = btrfs_block_group_used(&cache->item); if (bargs->usage == 0) - user_thresh = 0; + user_thresh = 1; else if (bargs->usage > 100) user_thresh = cache->key.offset; else @@ -2664,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, return 0; if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - factor = num_stripes / factor; + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { + factor = num_stripes / 2; + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { + factor = num_stripes - 1; + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { + factor = num_stripes - 2; + } else { + factor = num_stripes; + } for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); @@ -2985,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, int mixed = 0; int ret; u64 num_devices; + unsigned seq; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -3027,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); else allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10); + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6); if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && (!alloc_profile_is_valid(bctl->data.target, 1) || @@ -3067,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl, /* allow to reduce meta or sys integrity only if force set */ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10; - if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_system_alloc_bits & allowed) && - !(bctl->sys.target & allowed)) || - ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - printk(KERN_INFO "btrfs: force reducing metadata " - "integrity\n"); - } else { - printk(KERN_ERR "btrfs: balance will reduce metadata " - "integrity, use force if you want this\n"); - ret = -EINVAL; - goto out; + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6; + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_system_alloc_bits & allowed) && + !(bctl->sys.target & allowed)) || + ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && + (fs_info->avail_metadata_alloc_bits & allowed) && + !(bctl->meta.target & allowed))) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + printk(KERN_INFO "btrfs: force reducing metadata " + "integrity\n"); + } else { + printk(KERN_ERR "btrfs: balance will reduce metadata " + "integrity, use force if you want this\n"); + ret = -EINVAL; + goto out; + } } - } + } while (read_seqretry(&fs_info->profiles_lock, seq)); if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { int num_tolerated_disk_barrier_failures; @@ -3127,21 +3220,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, mutex_lock(&fs_info->balance_mutex); atomic_dec(&fs_info->balance_running); - if (bargs) { - memset(bargs, 0, sizeof(*bargs)); - update_ioctl_balance_args(fs_info, 0, bargs); - } - - if ((ret && ret != -ECANCELED && ret != -ENOSPC) || - balance_need_close(fs_info)) { - __cancel_balance(fs_info); - } - if (bctl->sys.flags & 
BTRFS_BALANCE_ARGS_CONVERT) { fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); } + if (bargs) { + memset(bargs, 0, sizeof(*bargs)); + update_ioctl_balance_args(fs_info, 0, bargs); + } + wake_up(&fs_info->balance_wait_q); return ret; @@ -3504,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b) } struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { - { 2, 1, 0, 4, 2, 2 /* raid10 */ }, - { 1, 1, 2, 2, 2, 2 /* raid1 */ }, - { 1, 2, 1, 1, 1, 2 /* dup */ }, - { 1, 1, 0, 2, 1, 1 /* raid0 */ }, - { 1, 1, 1, 1, 1, 1 /* single */ }, + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 4, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_RAID5] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID6] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .devs_increment = 1, + .ncopies = 3, + }, }; +static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) +{ + /* TODO allow them to set a preferred stripe size */ + return 64 * 1024; +} + +static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) +{ + u64 features; + + if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) + return; + + features = btrfs_super_incompat_flags(info->super_copy); + if (features & BTRFS_FEATURE_INCOMPAT_RAID56) + return; + + features |= BTRFS_FEATURE_INCOMPAT_RAID56; + btrfs_set_super_incompat_flags(info->super_copy, features); + printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); +} + static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, struct map_lookup **map_ret, @@ -3526,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_device_info *devices_info = NULL; u64 total_avail; int num_stripes; /* total number of stripes to allocate */ + int data_stripes; /* number of stripes that count for + block group size */ int sub_stripes; /* sub_stripes info for map */ int dev_stripes; /* stripes per dev */ int devs_max; /* max devs to use */ @@ -3537,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 max_chunk_size; u64 stripe_size; u64 num_bytes; + u64 raid_stripe_len = BTRFS_STRIPE_LEN; int ndevs; int i; int j; @@ -3631,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) continue; + if (ndevs == fs_devices->rw_devices) { + WARN(1, "%s: found more than %llu devices\n", + __func__, fs_devices->rw_devices); + break; + } devices_info[ndevs].dev_offset = dev_offset; devices_info[ndevs].max_avail = max_avail; devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; ++ndevs; - WARN_ON(ndevs > fs_devices->rw_devices); } /* @@ -3662,16 
+3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = devices_info[ndevs-1].max_avail; num_stripes = ndevs * dev_stripes; - if (stripe_size * ndevs > max_chunk_size * ncopies) { - stripe_size = max_chunk_size * ncopies; - do_div(stripe_size, ndevs); + /* + * this will have to be fixed for RAID1 and RAID10 over + * more drives + */ + data_stripes = num_stripes / ncopies; + + if (type & BTRFS_BLOCK_GROUP_RAID5) { + raid_stripe_len = find_raid56_stripe_len(ndevs - 1, + btrfs_super_stripesize(info->super_copy)); + data_stripes = num_stripes - 1; + } + if (type & BTRFS_BLOCK_GROUP_RAID6) { + raid_stripe_len = find_raid56_stripe_len(ndevs - 2, + btrfs_super_stripesize(info->super_copy)); + data_stripes = num_stripes - 2; + } + + /* + * Use the number of data stripes to figure out how big this chunk + * is really going to be in terms of logical address space, + * and compare that answer with the max chunk size + */ + if (stripe_size * data_stripes > max_chunk_size) { + u64 mask = (1ULL << 24) - 1; + stripe_size = max_chunk_size; + do_div(stripe_size, data_stripes); + + /* bump the answer up to a 16MB boundary */ + stripe_size = (stripe_size + mask) & ~mask; + + /* but don't go higher than the limits we found + * while searching for free extents + */ + if (stripe_size > devices_info[ndevs-1].max_avail) + stripe_size = devices_info[ndevs-1].max_avail; } do_div(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - do_div(stripe_size, BTRFS_STRIPE_LEN); - stripe_size *= BTRFS_STRIPE_LEN; + do_div(stripe_size, raid_stripe_len); + stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { @@ -3689,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } } map->sector_size = extent_root->sectorsize; - map->stripe_len = BTRFS_STRIPE_LEN; - map->io_align = BTRFS_STRIPE_LEN; - map->io_width = BTRFS_STRIPE_LEN; + map->stripe_len = raid_stripe_len; + map->io_align = raid_stripe_len; + map->io_width = raid_stripe_len; map->type = type; map->sub_stripes = sub_stripes; *map_ret = map; - num_bytes = stripe_size * (num_stripes / ncopies); + num_bytes = stripe_size * data_stripes; *stripe_size_out = stripe_size; *num_bytes_out = num_bytes; @@ -3718,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - free_extent_map(em); - if (ret) - goto error; - - ret = btrfs_make_block_group(trans, extent_root, 0, type, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - start, num_bytes); - if (ret) + if (ret) { + free_extent_map(em); goto error; + } for (i = 0; i < map->num_stripes; ++i) { struct btrfs_device *device; @@ -3739,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, info->chunk_root->root_key.objectid, BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, dev_offset, stripe_size); - if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); - goto error; - } + if (ret) + goto error_dev_extent; } + ret = btrfs_make_block_group(trans, extent_root, 0, type, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, num_bytes); + if (ret) { + i = map->num_stripes - 1; + goto error_dev_extent; + } + + free_extent_map(em); + check_raid56_incompat_flag(extent_root->fs_info, type); + kfree(devices_info); return 0; +error_dev_extent: + for (; i >= 0; i--) { + struct btrfs_device *device; + int err; + + device = map->stripes[i].dev; + err = btrfs_free_dev_extent(trans, device, start); + if (err) { + 
btrfs_abort_transaction(trans, extent_root, err); + break; + } + } + write_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + + /* One for our allocation */ + free_extent_map(em); + /* One for the tree reference */ + free_extent_map(em); error: kfree(map); kfree(devices_info); @@ -3887,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, if (ret) return ret; - alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - fs_info->avail_metadata_alloc_bits; - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - + alloc_profile = btrfs_get_alloc_profile(extent_root, 0); ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, &stripe_size, chunk_offset, alloc_profile); if (ret) @@ -3898,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, sys_chunk_offset = chunk_offset + chunk_size; - alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - fs_info->avail_system_alloc_bits; - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - + alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, &sys_chunk_size, &sys_stripe_size, sys_chunk_offset, alloc_profile); @@ -4014,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID5) + ret = 2; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + ret = 3; else ret = 1; free_extent_map(em); @@ -4026,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return ret; } +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, + struct btrfs_mapping_tree *map_tree, + u64 logical) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + unsigned long len = root->sectorsize; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + len = map->stripe_len * nr_data_stripes(map); + } + free_extent_map(em); + return len; +} + +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 len, int mirror_num) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + read_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) + ret = 1; + free_extent_map(em); + return ret; +} + static int find_live_mirror(struct btrfs_fs_info *fs_info, struct map_lookup *map, int first, int num, int optimal, int dev_replace_is_ongoing) @@ -4063,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return optimal; } +static inline int parity_smaller(u64 a, u64 b) +{ + return a > b; +} + +/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ +static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) +{ + struct btrfs_bio_stripe s; + int i; + u64 l; + int again = 1; + + while (again) { + again = 0; + for (i = 0; i 
< bbio->num_stripes - 1; i++) { + if (parity_smaller(raid_map[i], raid_map[i+1])) { + s = bbio->stripes[i]; + l = raid_map[i]; + bbio->stripes[i] = bbio->stripes[i+1]; + raid_map[i] = raid_map[i+1]; + bbio->stripes[i+1] = s; + raid_map[i+1] = l; + again = 1; + } + } + } +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, - int mirror_num) + int mirror_num, u64 **raid_map_ret) { struct extent_map *em; struct map_lookup *map; @@ -4078,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 stripe_nr; u64 stripe_nr_orig; u64 stripe_nr_end; + u64 stripe_len; + u64 *raid_map = NULL; int stripe_index; int i; int ret = 0; @@ -4089,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, int num_alloc_stripes; int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; + u64 raid56_full_stripe_start = (u64)-1; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); @@ -4105,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, map = (struct map_lookup *)em->bdev; offset = logical - em->start; + if (mirror_num > map->num_stripes) + mirror_num = 0; + + stripe_len = map->stripe_len; stripe_nr = offset; /* * stripe_nr counts the total number of stripes we have to stride * to get to this block */ - do_div(stripe_nr, map->stripe_len); + do_div(stripe_nr, stripe_len); - stripe_offset = stripe_nr * map->stripe_len; + stripe_offset = stripe_nr * stripe_len; BUG_ON(offset < stripe_offset); /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; - if (rw & REQ_DISCARD) + /* if we're here for raid56, we need to know the stripe aligned start */ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); + raid56_full_stripe_start = offset; + + /* allow a write of a full stripe, but make sure we don't + * allow straddling of stripes + */ + do_div(raid56_full_stripe_start, full_stripe_len); + raid56_full_stripe_start *= full_stripe_len; + } + + if (rw & REQ_DISCARD) { + /* we don't discard raid56 yet */ + if (map->type & + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + ret = -EOPNOTSUPP; + goto out; + } *length = min_t(u64, em->len - offset, *length); - else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - /* we limit the length of each bio to what fits in a stripe */ - *length = min_t(u64, em->len - offset, - map->stripe_len - stripe_offset); + } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + u64 max_len; + /* For writes to RAID[56], allow a full stripeset across all disks. + For other RAID types and for RAID[56] reads, just allow a single + stripe (on a single disk). 
*/ + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && + (rw & REQ_WRITE)) { + max_len = stripe_len * nr_data_stripes(map) - + (offset - raid56_full_stripe_start); + } else { + /* we limit the length of each bio to what fits in a stripe */ + max_len = stripe_len - stripe_offset; + } + *length = min_t(u64, em->len - offset, max_len); } else { *length = em->len - offset; } + /* This is for when we're called from btrfs_merge_bio_hook() and all + it cares about is the length */ if (!bbio_ret) goto out; @@ -4160,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 physical_of_found = 0; ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0); + logical, &tmp_length, &tmp_bbio, 0, NULL); if (ret) { WARN_ON(tmp_bbio != NULL); goto out; @@ -4221,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_stripes = 1; stripe_index = 0; stripe_nr_orig = stripe_nr; - stripe_nr_end = (offset + *length + map->stripe_len - 1) & - (~(map->stripe_len - 1)); + stripe_nr_end = ALIGN(offset + *length, map->stripe_len); do_div(stripe_nr_end, map->stripe_len); stripe_end_offset = stripe_nr_end * map->stripe_len - (offset + *length); + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { if (rw & REQ_DISCARD) num_stripes = min_t(u64, map->num_stripes, @@ -4276,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, dev_replace_is_ongoing); mirror_num = stripe_index - old_stripe_index + 1; } + + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + u64 tmp; + + if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) + && raid_map_ret) { + int i, rot; + + /* push stripe_nr back to the start of the full stripe */ + stripe_nr = raid56_full_stripe_start; + do_div(stripe_nr, stripe_len); + + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + + /* RAID[56] write or recovery. Return all stripes */ + num_stripes = map->num_stripes; + max_errors = nr_parity_stripes(map); + + raid_map = kmalloc(sizeof(u64) * num_stripes, + GFP_NOFS); + if (!raid_map) { + ret = -ENOMEM; + goto out; + } + + /* Work out the disk rotation on this stripe-set */ + tmp = stripe_nr; + rot = do_div(tmp, num_stripes); + + /* Fill in the logical address of each stripe */ + tmp = stripe_nr * nr_data_stripes(map); + for (i = 0; i < nr_data_stripes(map); i++) + raid_map[(i+rot) % num_stripes] = + em->start + (tmp + i) * map->stripe_len; + + raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + raid_map[(i+rot+1) % num_stripes] = + RAID6_Q_STRIPE; + + *length = map->stripe_len; + stripe_index = 0; + stripe_offset = 0; + } else { + /* + * Mirror #0 or #1 means the original data block. + * Mirror #2 is RAID5 parity block. + * Mirror #3 is RAID6 Q block. 
+ */ + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + if (mirror_num > 1) + stripe_index = nr_data_stripes(map) + + mirror_num - 2; + + /* We distribute the parity blocks across stripes */ + tmp = stripe_nr + stripe_index; + stripe_index = do_div(tmp, map->num_stripes); + } } else { /* * after this do_div call, stripe_nr is the number of stripes @@ -4384,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_DUP)) { max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { + max_errors = 2; } } @@ -4486,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio->stripes[0].physical = physical_to_patch_in_first_stripe; bbio->mirror_num = map->num_stripes + 1; } + if (raid_map) { + sort_parity_stripes(bbio, raid_map); + *raid_map_ret = raid_map; + } out: if (dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); @@ -4498,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, struct btrfs_bio **bbio_ret, int mirror_num) { return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, - mirror_num); + mirror_num, NULL); } int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, @@ -4512,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 bytenr; u64 length; u64 stripe_nr; + u64 rmap_len; int i, j, nr = 0; read_lock(&em_tree->lock); @@ -4522,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, map = (struct map_lookup *)em->bdev; length = em->len; + rmap_len = map->stripe_len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) do_div(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) do_div(length, map->num_stripes); + else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + do_div(length, nr_data_stripes(map)); + rmap_len = map->stripe_len * nr_data_stripes(map); + } buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); BUG_ON(!buf); /* -ENOMEM */ @@ -4545,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(stripe_nr, map->sub_stripes); } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i; - } - bytenr = chunk_start + stripe_nr * map->stripe_len; + } /* else if RAID[56], multiply by nr_data_stripes(). + * Alternatively, just use rmap_len below instead of + * map->stripe_len */ + + bytenr = chunk_start + stripe_nr * rmap_len; WARN_ON(nr >= map->num_stripes); for (j = 0; j < nr; j++) { if (buf[j] == bytenr) @@ -4560,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, *logical = buf; *naddrs = nr; - *stripe_len = map->stripe_len; + *stripe_len = rmap_len; free_extent_map(em); return 0; @@ -4634,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err) bio->bi_bdev = (struct block_device *) (unsigned long)bbio->mirror_num; /* only send an error to the higher layers if it is - * beyond the tolerance of the multi-bio + * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { err = -EIO; @@ -4668,13 +5079,18 @@ struct async_sched { * This will add one bio to the pending list for a device and make sure * the work struct is scheduled. 
*/ -static noinline void schedule_bio(struct btrfs_root *root, +noinline void btrfs_schedule_bio(struct btrfs_root *root, struct btrfs_device *device, int rw, struct bio *bio) { int should_queue = 1; struct btrfs_pending_bios *pending_bios; + if (device->missing || !device->bdev) { + bio_endio(bio, -EIO); + return; + } + /* don't bother with additional async steps for reads, right now */ if (!(rw & REQ_WRITE)) { bio_get(bio); @@ -4772,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, #endif bio->bi_bdev = dev->bdev; if (async) - schedule_bio(root, dev, rw, bio); + btrfs_schedule_bio(root, dev, rw, bio); else btrfsic_submit_bio(rw, bio); } @@ -4831,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; + u64 *raid_map = NULL; int ret; int dev_nr = 0; int total_devs = 1; @@ -4839,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, length = bio->bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, - mirror_num); - if (ret) + ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, + mirror_num, &raid_map); + if (ret) /* -ENOMEM */ return ret; total_devs = bbio->num_stripes; + bbio->orig_bio = first_bio; + bbio->private = first_bio->bi_private; + bbio->end_io = first_bio->bi_end_io; + atomic_set(&bbio->stripes_pending, bbio->num_stripes); + + if (raid_map) { + /* In this case, map_length has been set to the length of + a single stripe; not the whole write */ + if (rw & WRITE) { + return raid56_parity_write(root, bio, bbio, + raid_map, map_length); + } else { + return raid56_parity_recover(root, bio, bbio, + raid_map, map_length, + mirror_num); + } + } + if (map_length < length) { printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " "len %llu\n", (unsigned long long)logical, @@ -4853,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, BUG(); } - bbio->orig_bio = first_bio; - bbio->private = first_bio->bi_private; - bbio->end_io = first_bio->bi_end_io; - atomic_set(&bbio->stripes_pending, bbio->num_stripes); - while (dev_nr < total_devs) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d3c3939ac751..062d8604d35b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -21,8 +21,8 @@ #include <linux/bio.h> #include <linux/sort.h> +#include <linux/btrfs.h> #include "async-thread.h" -#include "ioctl.h" #define BTRFS_STRIPE_LEN (64 * 1024) @@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); int btrfs_scratch_superblock(struct btrfs_device *device); - +void btrfs_schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio); +int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, + u64 logical, u64 len, int mirror_num); +unsigned long btrfs_full_stripe_len(struct btrfs_root *root, + struct btrfs_mapping_tree *map_tree, + u64 logical); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) { diff --git a/include/linux/btrfs.h b/include/linux/btrfs.h new file mode 100644 index 000000000000..22d799147db2 --- /dev/null +++ b/include/linux/btrfs.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_BTRFS_H +#define _LINUX_BTRFS_H + +#include <uapi/linux/btrfs.h> + +#endif /* _LINUX_BTRFS_H */ diff --git
a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 4e67194fd2c3..5c8a1d25e21c 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -68,6 +68,7 @@ header-y += blkpg.h header-y += blktrace_api.h header-y += bpqether.h header-y += bsg.h +header-y += btrfs.h header-y += can.h header-y += capability.h header-y += capi.h diff --git a/fs/btrfs/ioctl.h b/include/uapi/linux/btrfs.h similarity index 96% rename from fs/btrfs/ioctl.h rename to include/uapi/linux/btrfs.h index dabca9cc8c2e..fa3a5f9338fc 100644 --- a/fs/btrfs/ioctl.h +++ b/include/uapi/linux/btrfs.h @@ -16,8 +16,9 @@ * Boston, MA 021110-1307, USA. */ -#ifndef __IOCTL_ -#define __IOCTL_ +#ifndef _UAPI_LINUX_BTRFS_H +#define _UAPI_LINUX_BTRFS_H +#include <linux/types.h> #include <linux/ioctl.h> #define BTRFS_IOCTL_MAGIC 0x94 @@ -406,6 +407,13 @@ struct btrfs_ioctl_received_subvol_args { __u64 reserved[16]; /* in */ }; +/* + * Caller doesn't want file data in the send stream, even if the + * search of clone sources doesn't find an extent. UPDATE_EXTENT + * commands will be sent instead of WRITE commands. + */ +#define BTRFS_SEND_FLAG_NO_FILE_DATA 0x1 + struct btrfs_ioctl_send_args { __s64 send_fd; /* in */ __u64 clone_sources_count; /* in */ @@ -494,9 +502,13 @@ struct btrfs_ioctl_send_args { struct btrfs_ioctl_qgroup_create_args) #define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \ struct btrfs_ioctl_qgroup_limit_args) +#define BTRFS_IOC_GET_FSLABEL _IOR(BTRFS_IOCTL_MAGIC, 49, \ + char[BTRFS_LABEL_SIZE]) +#define BTRFS_IOC_SET_FSLABEL _IOW(BTRFS_IOCTL_MAGIC, 50, \ + char[BTRFS_LABEL_SIZE]) #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ struct btrfs_ioctl_get_dev_stats) #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ struct btrfs_ioctl_dev_replace_args) -#endif +#endif /* _UAPI_LINUX_BTRFS_H */
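
A few notes on the RAID5/6 and allocator changes above follow, each with a small self-contained C sketch. The sketches are userspace renderings of the patch's arithmetic under stated assumptions (plain division instead of do_div(), stand-in struct and helper names); they are illustrations, not kernel code. First, the btrfs_raid_array table: the designated initializers make it explicit which attribute constrains what, and the hypothetical usable_devs() helper below shows how devs_max, devs_increment and devs_min combine to decide whether a profile can be allocated on a given number of writable devices, roughly mirroring the checks in __btrfs_alloc_chunk().

#include <stdio.h>

/*
 * Mirrors the shape of the btrfs_raid_attr table introduced above; the
 * values are copied from the patch, but the struct and helpers here are
 * illustrative stand-ins, not the kernel definitions.
 */
struct raid_attr {
	const char *name;
	int sub_stripes;	/* stripes paired for mirroring (RAID10) */
	int dev_stripes;	/* stripes placed on one device (DUP = 2) */
	int devs_max;		/* 0 == as many devices as available */
	int devs_min;		/* minimum devices for this profile */
	int devs_increment;	/* devices must come in these steps */
	int ncopies;		/* devices touched by one logical block */
};

static const struct raid_attr table[] = {
	{ "raid10", 2, 1, 0, 4, 2, 2 },
	{ "raid1",  1, 1, 2, 2, 2, 2 },
	{ "dup",    1, 2, 1, 1, 1, 2 },
	{ "raid0",  1, 1, 0, 2, 1, 1 },
	{ "single", 1, 1, 1, 1, 1, 1 },
	{ "raid5",  1, 1, 0, 2, 1, 2 },
	{ "raid6",  1, 1, 0, 3, 1, 3 },
};

/*
 * How many of 'rw_devices' writable devices can take part in one chunk?
 * Returns 0 when the profile cannot be allocated at all.
 */
static int usable_devs(const struct raid_attr *a, int rw_devices)
{
	int n = rw_devices;

	if (a->devs_max && n > a->devs_max)
		n = a->devs_max;
	n -= n % a->devs_increment;	/* e.g. RAID10 consumes device pairs */
	return n >= a->devs_min ? n : 0;
}

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		printf("%-6s: min %d devices, %d copies, usable on 3 disks: %d\n",
		       table[i].name, table[i].devs_min, table[i].ncopies,
		       usable_devs(&table[i], 3));
	return 0;
}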
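
The sizing logic in __btrfs_alloc_chunk() is the part most changed by RAID5/6: the chunk is now limited by its logical size (data stripes only), and the per-device stripe is bumped to a 16MB boundary but capped by the free space actually found on the chosen devices. The hypothetical chunk_logical_size() below condenses that sequence; the profile enum and the fixed 64K stripe length are assumptions made for the sketch.

#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN	(64 * 1024ULL)
#define SZ_16M_MASK	((1ULL << 24) - 1)

enum profile { P_SINGLE, P_RAID1, P_RAID10, P_RAID5, P_RAID6 };

/*
 * ndevs       - devices taking part in the chunk
 * dev_stripes - stripes per device (1 except for DUP)
 * ncopies     - copies per logical block (2 for RAID1/10)
 * max_avail   - smallest usable free extent among the chosen devices
 * Returns the logical address space covered by the chunk.
 */
static uint64_t chunk_logical_size(enum profile p, int ndevs, int dev_stripes,
				   int ncopies, uint64_t max_avail,
				   uint64_t max_chunk_size)
{
	uint64_t stripe_size = max_avail;
	int num_stripes = ndevs * dev_stripes;
	int data_stripes;

	switch (p) {
	case P_RAID5:
		data_stripes = num_stripes - 1;		/* one parity stripe */
		break;
	case P_RAID6:
		data_stripes = num_stripes - 2;		/* P and Q stripes */
		break;
	default:
		data_stripes = num_stripes / ncopies;
		break;
	}

	/* compare the chunk's logical size, not its raw size, to the cap */
	if (stripe_size * data_stripes > max_chunk_size) {
		stripe_size = max_chunk_size / data_stripes;
		/* bump the per-device stripe up to a 16MB boundary ... */
		stripe_size = (stripe_size + SZ_16M_MASK) & ~SZ_16M_MASK;
		/* ... but never past what the devices can actually hold */
		if (stripe_size > max_avail)
			stripe_size = max_avail;
	}

	stripe_size /= dev_stripes;
	stripe_size -= stripe_size % STRIPE_LEN;	/* align down to 64K */

	return stripe_size * data_stripes;
}

int main(void)
{
	/* RAID6 over 4 devices, 1TB free on each, 10GB chunk size cap */
	uint64_t n = chunk_logical_size(P_RAID6, 4, 1, 1, 1ULL << 40, 10ULL << 30);

	printf("chunk covers %llu bytes of logical space\n",
	       (unsigned long long)n);
	return 0;
}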
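
btrfs_num_copies() and the max_errors selection in __btrfs_map_block() describe the same redundancy from two sides: how many independent ways a block can be read back, and how many failed writes a stripe set tolerates. The pairing, with RAID5/6 treated as "copies" that are really reconstruction paths through parity, looks like this (illustrative functions, not the kernel ones):

#include <stdio.h>

enum bg_profile { BG_SINGLE, BG_DUP, BG_RAID0, BG_RAID1,
		  BG_RAID10, BG_RAID5, BG_RAID6 };

/* How many ways can a logical block be produced? (cf. btrfs_num_copies) */
static int num_copies(enum bg_profile p, int num_stripes, int sub_stripes)
{
	switch (p) {
	case BG_RAID1:
	case BG_DUP:	return num_stripes;	/* every stripe is a full copy */
	case BG_RAID10:	return sub_stripes;	/* mirrors within one stripe set */
	case BG_RAID5:	return 2;		/* data, or rebuilt from parity */
	case BG_RAID6:	return 3;		/* data, via P, or via Q */
	default:	return 1;
	}
}

/* How many failed writes can be tolerated? (cf. max_errors above) */
static int max_errors(enum bg_profile p)
{
	switch (p) {
	case BG_RAID1:
	case BG_RAID10:
	case BG_RAID5:
	case BG_DUP:	return 1;
	case BG_RAID6:	return 2;
	default:	return 0;
	}
}

int main(void)
{
	printf("raid6: copies=%d, tolerated write errors=%d\n",
	       num_copies(BG_RAID6, 6, 1), max_errors(BG_RAID6));
	return 0;
}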
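
The raid56_full_stripe_start computation decides how much of a bio a single mapping call may cover: RAID5/6 writes may run to the end of the full data stripe so complete parity can be computed, while reads and all other profiles still stop at a device stripe boundary. A standalone version of that clamp, with a hypothetical mappable_len() helper and ordinary modulo arithmetic standing in for do_div():

#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN	(64 * 1024ULL)

/*
 * offset          - byte offset of the bio inside the chunk
 * chunk_len       - total length of the chunk
 * nr_data_stripes - data stripes per full stripe (num_stripes - 1 or - 2)
 * Returns how many bytes of the request one mapping call may cover.
 */
static uint64_t mappable_len(uint64_t offset, uint64_t chunk_len,
			     int nr_data_stripes, int is_raid56, int is_write)
{
	uint64_t stripe_offset = offset % STRIPE_LEN;
	uint64_t max_len;

	if (is_raid56 && is_write) {
		uint64_t full_stripe_len = STRIPE_LEN * nr_data_stripes;
		/* round the offset down to the start of the full stripe */
		uint64_t full_stripe_start = offset - (offset % full_stripe_len);

		/* a write may run to the end of the full stripe ... */
		max_len = full_stripe_len - (offset - full_stripe_start);
	} else {
		/* ... everything else stops at the device stripe boundary */
		max_len = STRIPE_LEN - stripe_offset;
	}

	uint64_t remaining = chunk_len - offset;
	return remaining < max_len ? remaining : max_len;
}

int main(void)
{
	/* RAID6 over 6 devices: 4 data stripes, so a 256KB full stripe */
	printf("write at 96K may cover %llu bytes\n", (unsigned long long)
	       mappable_len(96 * 1024, 1ULL << 30, 4, 1, 1));
	printf("read  at 96K may cover %llu bytes\n", (unsigned long long)
	       mappable_len(96 * 1024, 1ULL << 30, 4, 1, 0));
	return 0;
}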
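
For RAID5/6 writes and recovery, __btrfs_map_block() returns every stripe of the set plus a raid_map describing what each one holds: the logical address of each data stripe, rotated by the full-stripe number so parity migrates across devices, then the RAID5_P_STRIPE and RAID6_Q_STRIPE markers, and finally a bubble sort that pushes the parity entries to the end. The sketch below rebuilds that array for one full stripe; the sentinel values, the stripe struct and sort_parity() are stand-ins for definitions that live in the kernel headers.

#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN	(64 * 1024ULL)
#define P_STRIPE	((uint64_t)-2)	/* stand-in for RAID5_P_STRIPE */
#define Q_STRIPE	((uint64_t)-1)	/* stand-in for RAID6_Q_STRIPE */

struct stripe { int devid; };		/* stand-in for btrfs_bio_stripe */

static int parity_smaller(uint64_t a, uint64_t b)
{
	return a > b;
}

/*
 * Same shape as sort_parity_stripes(): the physical stripes and the raid_map
 * are permuted together, and because the parity markers are the largest
 * possible values they always end up in the last slots.
 */
static void sort_parity(struct stripe *s, uint64_t *map, int n)
{
	int again = 1;

	while (again) {
		again = 0;
		for (int i = 0; i < n - 1; i++) {
			if (parity_smaller(map[i], map[i + 1])) {
				struct stripe ts = s[i];
				uint64_t tm = map[i];

				s[i] = s[i + 1];
				map[i] = map[i + 1];
				s[i + 1] = ts;
				map[i + 1] = tm;
				again = 1;
			}
		}
	}
}

int main(void)
{
	/* RAID6 over 6 devices: 4 data stripes plus P and Q */
	int num_stripes = 6, nr_data = 4;
	uint64_t chunk_start = 1ULL << 30;	/* logical start of the chunk */
	uint64_t full_stripe_nr = 3;		/* which full stripe in the chunk */
	uint64_t map[6];
	struct stripe stripes[6];
	int i, rot;

	for (i = 0; i < num_stripes; i++)
		stripes[i].devid = i;	/* device order as stored in the chunk */

	/* work out the disk rotation on this stripe set */
	rot = full_stripe_nr % num_stripes;

	/* fill in the logical address of each data stripe, rotated */
	for (i = 0; i < nr_data; i++)
		map[(i + rot) % num_stripes] = chunk_start +
			(full_stripe_nr * nr_data + i) * STRIPE_LEN;

	/* parity goes in the remaining two rotated slots */
	map[(i + rot) % num_stripes] = P_STRIPE;
	map[(i + rot + 1) % num_stripes] = Q_STRIPE;

	sort_parity(stripes, map, num_stripes);

	for (i = 0; i < num_stripes; i++)
		printf("slot %d: device %d holds %s\n", i, stripes[i].devid,
		       map[i] == P_STRIPE ? "P" :
		       map[i] == Q_STRIPE ? "Q" : "data");
	return 0;
}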
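
On the ordinary read path the same rotation is applied in reverse: the stripe number is split into a full-stripe number and a data-stripe index, mirror_num 2 or 3 redirects the lookup to the P or Q slot (matching the mirror numbering in the comment above), and the final modulo by the total stripe count spreads parity across all devices. A hypothetical raid56_read_stripe() makes the selection explicit:

#include <stdint.h>
#include <stdio.h>

/*
 * stripe_nr   - logical stripe number inside the chunk (offset / stripe_len)
 * nr_data     - data stripes per full stripe
 * num_stripes - total stripes, including parity
 * mirror_num  - 0/1: data, 2: rebuild via P, 3: rebuild via Q (RAID6 only)
 * Returns the index into the stripe array that should service the read.
 */
static int raid56_read_stripe(uint64_t stripe_nr, int nr_data,
			      int num_stripes, int mirror_num)
{
	int stripe_index = stripe_nr % nr_data;		/* which data stripe */
	uint64_t full_stripe_nr = stripe_nr / nr_data;

	if (mirror_num > 1)				/* ask for P (2) or Q (3) */
		stripe_index = nr_data + mirror_num - 2;

	/* rotate the whole stripe set so parity moves every full stripe */
	return (full_stripe_nr + stripe_index) % num_stripes;
}

int main(void)
{
	/* RAID5 over 4 devices (3 data + P): watch the parity slot rotate */
	for (uint64_t snr = 0; snr < 9; snr++)
		printf("stripe %llu -> device slot %d (parity at slot %d)\n",
		       (unsigned long long)snr,
		       raid56_read_stripe(snr, 3, 4, 0),
		       raid56_read_stripe(snr, 3, 4, 2));
	return 0;
}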