linux-stable/fs/btrfs
Filipe Manana c125b8bff1 Btrfs: ensure readers see new data after a clone operation
We were cleaning the clone target file range from the page cache before
we did replace the file extent items in the fs tree. This was racy,
as right after cleaning the relevant range from the page cache and before
replacing the file extent items, a read against that range could be
performed by another task and populate again the page cache with stale
data (stale after the cloning finishes). This would result in reads after
the clone operation successfully finishes to get old data (and potentially
for a very long time). Therefore evict the pages after replacing the file
extent items, so that subsequent reads will always get the new data.

Similarly, we were prone to races while cloning the file extent items
because we weren't locking the target range and wait for any existing
ordered extents against that range to complete. It was possible that
after cloning the extent items, a write operation that was performed
before the clone operation and overlaps the same range, would end up
undoing all or part of the work the clone operation did (a worker task
running inode.c:btrfs_finish_ordered_io). Therefore lock the target
range in the io tree, wait for all pending ordered extents against that
range to finish and then safely perform the cloning.

The issue of reading stale data after the clone operation is easy to
reproduce by running the following C program in a loop until it exits
with return value 1.

 #include <unistd.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 #include <pthread.h>
 #include <fcntl.h>
 #include <assert.h>
 #include <asm/types.h>
 #include <linux/ioctl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/ioctl.h>

 #define SRC_FILE "/mnt/sdd/foo"
 #define DST_FILE "/mnt/sdd/bar"
 #define FILE_SIZE (16 * 1024)
 #define PATTERN_SRC 'X'
 #define PATTERN_DST 'Y'

struct btrfs_ioctl_clone_range_args {
	__s64 src_fd;
	__u64 src_offset, src_length;
	__u64 dest_offset;
};

 #define BTRFS_IOCTL_MAGIC 0x94
 #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
				   struct btrfs_ioctl_clone_range_args)

static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static int clone_done = 0;
static int reader_ready = 0;
static int stale_data = 0;

static void *reader_loop(void *arg)
{
	char buf[4096], want_buf[4096];

	memset(want_buf, PATTERN_SRC, 4096);
	pthread_mutex_lock(&mutex);
	reader_ready = 1;
	pthread_mutex_unlock(&mutex);

	while (1) {
		int done, fd, ret;

		fd = open(DST_FILE, O_RDONLY);
		assert(fd != -1);

		pthread_mutex_lock(&mutex);
		done = clone_done;
		pthread_mutex_unlock(&mutex);

		ret = read(fd, buf, 4096);
		assert(ret == 4096);
		close(fd);

		if (done) {
			ret = memcmp(buf, want_buf, 4096);
			if (ret == 0) {
				printf("Found new content\n");
			} else {
				printf("Found old content\n");
				pthread_mutex_lock(&mutex);
				stale_data = 1;
				pthread_mutex_unlock(&mutex);
			}
			break;
		}
	}
	return NULL;
}

int main(int argc, char *argv[])
{
	pthread_t reader;
	int ret, i, fd;
	struct btrfs_ioctl_clone_range_args clone_args;
	int fd1, fd2;

	ret = remove(SRC_FILE);
	if (ret == -1 && errno != ENOENT) {
		fprintf(stderr, "Error deleting src file: %s\n", strerror(errno));
		return 1;
	}
	ret = remove(DST_FILE);
	if (ret == -1 && errno != ENOENT) {
		fprintf(stderr, "Error deleting dst file: %s\n", strerror(errno));
		return 1;
	}

	fd = open(SRC_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
	assert(fd != -1);
	for (i = 0; i < FILE_SIZE; i++) {
		char c = PATTERN_SRC;
		ret = write(fd, &c, 1);
		assert(ret == 1);
	}
	close(fd);
	fd = open(DST_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU);
	assert(fd != -1);
	for (i = 0; i < FILE_SIZE; i++) {
		char c = PATTERN_DST;
		ret = write(fd, &c, 1);
		assert(ret == 1);
	}
	close(fd);
        sync();

	ret = pthread_create(&reader, NULL, reader_loop, NULL);
	assert(ret == 0);
	while (1) {
		int r;
		pthread_mutex_lock(&mutex);
		r = reader_ready;
		pthread_mutex_unlock(&mutex);
		if (r) break;
	}

	fd1 = open(SRC_FILE, O_RDONLY);
	if (fd1 < 0) {
		fprintf(stderr, "Error open src file: %s\n", strerror(errno));
		return 1;
	}
	fd2 = open(DST_FILE, O_RDWR);
	if (fd2 < 0) {
		fprintf(stderr, "Error open dst file: %s\n", strerror(errno));
		return 1;
	}
	clone_args.src_fd = fd1;
	clone_args.src_offset = 0;
	clone_args.src_length = 4096;
	clone_args.dest_offset = 0;
	ret = ioctl(fd2, BTRFS_IOC_CLONE_RANGE, &clone_args);
	assert(ret == 0);
	close(fd1);
	close(fd2);

	pthread_mutex_lock(&mutex);
	clone_done = 1;
	pthread_mutex_unlock(&mutex);
	ret = pthread_join(reader, NULL);
	assert(ret == 0);

	pthread_mutex_lock(&mutex);
	ret = stale_data ? 1 : 0;
	pthread_mutex_unlock(&mutex);
	return ret;
}

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-09 17:21:02 -07:00
..
tests Btrfs: add sanity tests for new qgroup accounting code 2014-06-09 17:20:49 -07:00
acl.c btrfs: remove useless ACL check 2014-06-09 17:20:42 -07:00
async-thread.c btrfs: fix crash in remount(thread_pool=) case 2014-04-07 10:41:52 -07:00
async-thread.h btrfs: Add trace for btrfs_workqueue alloc/destroy 2014-03-20 17:15:28 -07:00
backref.c Btrfs: add sanity tests for new qgroup accounting code 2014-06-09 17:20:49 -07:00
backref.h Btrfs: rework qgroup accounting 2014-06-09 17:20:48 -07:00
btrfs_inode.h btrfs: Drop EXTENT_UPTODATE check in hole punching and direct locking 2014-06-09 17:20:57 -07:00
check-integrity.c btrfs: check_int: propagate out-of-memory error upwards 2014-06-09 17:20:21 -07:00
check-integrity.h block: submit_bio_wait() conversions 2013-11-24 16:33:41 -07:00
compression.c btrfs: return ptr error from compression workspace 2014-06-09 17:20:22 -07:00
compression.h btrfs: make static code static & remove dead code 2013-05-06 15:55:23 -04:00
ctree.c Btrfs: add sanity tests for new qgroup accounting code 2014-06-09 17:20:49 -07:00
ctree.h btrfs: allocate raid type kobjects dynamically 2014-06-09 17:21:01 -07:00
delayed-inode.c btrfs: Cleanup the "_struct" suffix in btrfs_workequeue 2014-03-10 15:17:16 -04:00
delayed-inode.h Btrfs: introduce the delayed inode ref deletion for the single link inode 2014-01-28 13:20:09 -08:00
delayed-ref.c Btrfs: rework qgroup accounting 2014-06-09 17:20:48 -07:00
delayed-ref.h Btrfs: rework qgroup accounting 2014-06-09 17:20:48 -07:00
dev-replace.c Btrfs: don't flush all delalloc inodes when we doesn't get s_umount lock 2014-03-10 15:17:27 -04:00
dev-replace.h
dir-item.c Btrfs: convert printk to btrfs_ and fix BTRFS prefix 2014-01-28 13:20:05 -08:00
disk-io.c Btrfs: async delayed refs 2014-06-09 17:20:58 -07:00
disk-io.h Btrfs: add sanity tests for new qgroup accounting code 2014-06-09 17:20:49 -07:00
export.c btrfs: remove fs/btrfs/compat.h 2013-11-11 22:03:19 -05:00
export.h
extent-tree.c btrfs: allocate raid type kobjects dynamically 2014-06-09 17:21:01 -07:00
extent_io.c Btrfs: split up __extent_writepage to lower stack usage 2014-06-09 17:20:58 -07:00
extent_io.h Btrfs: add sanity tests for new qgroup accounting code 2014-06-09 17:20:49 -07:00
extent_map.c Btrfs: more efficient btrfs_drop_extent_cache 2014-03-10 15:16:57 -04:00
extent_map.h Btrfs: more efficient btrfs_drop_extent_cache 2014-03-10 15:16:57 -04:00
file-item.c Btrfs: don't access non-existent key when csum tree is empty 2014-06-09 17:20:44 -07:00
file.c btrfs: Drop EXTENT_UPTODATE check in hole punching and direct locking 2014-06-09 17:20:57 -07:00
free-space-cache.c Btrfs: break up __btrfs_write_out_cache to cut down stack usage 2014-06-09 17:20:55 -07:00
free-space-cache.h Btrfs: remove path arg from btrfs_truncate_free_space_cache 2013-11-11 21:51:33 -05:00
hash.c Btrfs: fix btrfs boot when compiled as built-in 2014-01-28 13:20:31 -08:00
hash.h Btrfs: fix btrfs boot when compiled as built-in 2014-01-28 13:20:31 -08:00
inode-item.c btrfs: cleanup: removed unused 'btrfs_get_inode_ref_index' 2014-01-28 13:19:39 -08:00
inode-map.c btrfs: remove newline from inode cache kthread name 2014-06-09 17:20:53 -07:00
inode-map.h
inode.c Btrfs: async delayed refs 2014-06-09 17:20:58 -07:00
ioctl.c Btrfs: ensure readers see new data after a clone operation 2014-06-09 17:21:02 -07:00
Kconfig Btrfs: fix btrfs boot when compiled as built-in 2014-01-28 13:20:31 -08:00
locking.c btrfs: make static code static & remove dead code 2013-05-06 15:55:23 -04:00
locking.h
lzo.c btrfs: return errno instead of -1 from compression 2014-06-09 17:20:21 -07:00
Makefile Btrfs: add sanity tests for new qgroup accounting code 2014-06-09 17:20:49 -07:00
math.h
ordered-data.c btrfs: remove stale newlines from log messages 2014-06-09 17:20:53 -07:00
ordered-data.h btrfs: Cleanup the "_struct" suffix in btrfs_workequeue 2014-03-10 15:17:16 -04:00
orphan.c btrfs: expand btrfs_find_item() to include find_orphan_item functionality 2014-01-28 13:19:37 -08:00
print-tree.c Btrfs: don't use ram_bytes for uncompressed inline items 2014-01-29 07:06:29 -08:00
print-tree.h btrfs: make static code static & remove dead code 2013-05-06 15:55:23 -04:00
props.c Btrfs: add support for inode properties 2014-01-28 13:20:24 -08:00
props.h Btrfs: add support for inode properties 2014-01-28 13:20:24 -08:00
qgroup.c Btrfs: free tmp ulist for qgroup rescan 2014-06-09 17:20:55 -07:00
qgroup.h Btrfs: rework qgroup accounting 2014-06-09 17:20:48 -07:00
raid56.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs 2014-04-04 15:31:36 -07:00
raid56.h
rcu-string.h
reada.c btrfs: Cleanup the "_struct" suffix in btrfs_workequeue 2014-03-10 15:17:16 -04:00
relocation.c btrfs: remove stale newlines from log messages 2014-06-09 17:20:53 -07:00
root-tree.c Btrfs: use bitfield instead of integer data type for the some variants in btrfs_root 2014-06-09 17:20:40 -07:00
scrub.c btrfs: Remove unnecessary check for NULL 2014-06-09 17:20:23 -07:00
send.c Btrfs: send, use the right limits for xattr names and values 2014-06-09 17:21:00 -07:00
send.h btrfs: make static code static & remove dead code 2013-05-06 15:55:23 -04:00
struct-funcs.c
super.c btrfs: remove stale newlines from log messages 2014-06-09 17:20:53 -07:00
sysfs.c btrfs: allocate raid type kobjects dynamically 2014-06-09 17:21:01 -07:00
sysfs.h btrfs: add simple debugfs interface 2014-03-10 15:15:51 -04:00
transaction.c Btrfs: async delayed refs 2014-06-09 17:20:58 -07:00
transaction.h Btrfs: add sanity tests for new qgroup accounting code 2014-06-09 17:20:49 -07:00
tree-defrag.c Btrfs: use bitfield instead of integer data type for the some variants in btrfs_root 2014-06-09 17:20:40 -07:00
tree-log.c Btrfs: use helpers for last_trans_log_full_commit instead of opencode 2014-06-09 17:20:45 -07:00
tree-log.h Btrfs: use helpers for last_trans_log_full_commit instead of opencode 2014-06-09 17:20:45 -07:00
ulist.c Btrfs: do not export ulist functions 2014-01-29 07:06:27 -08:00
ulist.h Btrfs: do not export ulist functions 2014-01-29 07:06:27 -08:00
uuid-tree.c Btrfs: convert printk to btrfs_ and fix BTRFS prefix 2014-01-28 13:20:05 -08:00
volumes.c fs: btrfs: volumes.c: Fix for possible null pointer dereference 2014-06-09 17:21:01 -07:00
volumes.h btrfs: balance filter: add limit of processed chunks 2014-06-09 17:20:26 -07:00
xattr.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs 2014-01-30 20:08:20 -08:00
xattr.h btrfs: use generic posix ACL infrastructure 2014-01-25 23:58:18 -05:00
zlib.c btrfs: return errno instead of -1 from compression 2014-06-09 17:20:21 -07:00