From 879c5e6b7cb4c689d08ca9b2e353d8ab3dc425d5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 17 Jun 2009 11:47:48 -0400
Subject: [PATCH 01/15] jbd2: convert instrumentation from markers to tracepoints

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c        |   5 +-
 fs/jbd2/commit.c            |  13 +--
 fs/jbd2/journal.c           |  69 +++++++++++++++
 include/linux/jbd2.h        |   6 ++
 include/trace/events/jbd2.h | 168 ++++++++++++++++++++++++++++++++++++
 5 files changed, 252 insertions(+), 9 deletions(-)
 create mode 100644 include/trace/events/jbd2.h

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 17159cacbd9e..5d70b3e6d49b 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,9 +20,9 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
-#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <trace/events/jbd2.h>
 
 /*
  * Unlink a buffer from a transaction checkpoint list.
@@ -358,8 +358,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	 * journal straight away.
 	 */
 	result = jbd2_cleanup_journal_tail(journal);
-	trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
-		   journal->j_devname, result);
+	trace_jbd2_checkpoint(journal, result);
 	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
 	if (result <= 0)
 		return result;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0b7d3b8226fd..7b4088b2364d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,7 +16,6 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
-#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
@@ -26,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
+#include <trace/events/jbd2.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -253,6 +253,7 @@ static int journal_submit_data_buffers(journal_t *journal,
 		 * block allocation with delalloc. We need to write
 		 * only allocated blocks here.
 		 */
+		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 		err = journal_submit_inode_data_buffers(mapping);
 		if (!ret)
 			ret = err;
@@ -394,8 +395,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction = journal->j_running_transaction;
 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 
-	trace_mark(jbd2_start_commit, "dev %s transaction %d",
-		   journal->j_devname, commit_transaction->t_tid);
+	trace_jbd2_start_commit(journal, commit_transaction);
 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
@@ -409,6 +409,7 @@
 	 */
 	if (commit_transaction->t_synchronous_commit)
 		write_op = WRITE_SYNC_PLUG;
+	trace_jbd2_commit_locking(journal, commit_transaction);
 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 	stats.u.run.rs_locked = jiffies;
 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -484,6 +485,7 @@
 	 */
 	jbd2_journal_switch_revoke_table(journal);
 
+	trace_jbd2_commit_flushing(journal, commit_transaction);
 	stats.u.run.rs_flushing = jiffies;
 	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
 					       stats.u.run.rs_flushing);
@@ -520,6 +522,7 @@
 	commit_transaction->t_state = T_COMMIT;
 	spin_unlock(&journal->j_state_lock);
 
+	trace_jbd2_commit_logging(journal, commit_transaction);
 	stats.u.run.rs_logging = jiffies;
 	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
 						 stats.u.run.rs_logging);
@@ -1054,9 +1057,7 @@ restart_loop:
 	if (journal->j_commit_callback)
 		journal->j_commit_callback(journal, commit_transaction);
 
-	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
-		   journal->j_devname, commit_transaction->t_tid,
-		   journal->j_tail_sequence);
+	trace_jbd2_end_commit(journal, commit_transaction);
 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
 	if (to_free)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 62be7d294ec2..18bfd5dab642 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -38,6 +38,10 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/math64.h>
+#include <linux/hash.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/jbd2.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -2377,6 +2381,71 @@ static void __exit journal_exit(void)
 	jbd2_journal_destroy_caches();
 }
 
+/*
+ * jbd2_dev_to_name is a utility function used by the jbd2 and ext4
+ * tracing infrastructure to map a dev_t to a device name.
+ *
+ * The caller should use rcu_read_lock() in order to make sure the
+ * device name stays valid until it's done with it.  We use
+ * rcu_read_lock() as well to make sure we're safe in case the caller
+ * gets sloppy, and because rcu_read_lock() is cheap and can be safely
+ * nested.
+ */
+struct devname_cache {
+	struct rcu_head	rcu;
+	dev_t		device;
+	char		devname[BDEVNAME_SIZE];
+};
+#define CACHE_SIZE_BITS 6
+static struct devname_cache *devcache[1 << CACHE_SIZE_BITS];
+static DEFINE_SPINLOCK(devname_cache_lock);
+
+static void free_devcache(struct rcu_head *rcu)
+{
+	kfree(rcu);
+}
+
+const char *jbd2_dev_to_name(dev_t device)
+{
+	int	i = hash_32(device, CACHE_SIZE_BITS);
+	char	*ret;
+	struct block_device *bd;
+
+	rcu_read_lock();
+	if (devcache[i] && devcache[i]->device == device) {
+		ret = devcache[i]->devname;
+		rcu_read_unlock();
+		return ret;
+	}
+	rcu_read_unlock();
+
+	spin_lock(&devname_cache_lock);
+	if (devcache[i]) {
+		if (devcache[i]->device == device) {
+			ret = devcache[i]->devname;
+			spin_unlock(&devname_cache_lock);
+			return ret;
+		}
+		call_rcu(&devcache[i]->rcu, free_devcache);
+	}
+	devcache[i] = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
+	if (!devcache[i]) {
+		spin_unlock(&devname_cache_lock);
+		return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
+	}
+	devcache[i]->device = device;
+	bd = bdget(device);
+	if (bd) {
+		bdevname(bd, devcache[i]->devname);
+		bdput(bd);
+	} else
+		__bdevname(device, devcache[i]->devname);
+	ret = devcache[i]->devname;
+	spin_unlock(&devname_cache_lock);
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_dev_to_name);
+
 MODULE_LICENSE("GPL");
 module_init(journal_init);
 module_exit(journal_exit);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index cc02393bfce8..d97eb652d6ca 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1315,6 +1315,12 @@ extern int jbd_blocks_per_page(struct inode *inode);
 #define BUFFER_TRACE2(bh, bh2, info)	do {} while (0)
 #define JBUFFER_TRACE(jh, info)	do {} while (0)
 
+/*
+ * jbd2_dev_to_name is a utility function used by the jbd2 and ext4
+ * tracing infrastructure to map a dev_t to a device name.
+ */ +extern const char *jbd2_dev_to_name(dev_t device); + #endif /* __KERNEL__ */ #endif /* _LINUX_JBD2_H */ diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h new file mode 100644 index 000000000000..845b0b4b48fd --- /dev/null +++ b/include/trace/events/jbd2.h @@ -0,0 +1,168 @@ +#if !defined(_TRACE_JBD2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_JBD2_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM jbd2 + +TRACE_EVENT(jbd2_checkpoint, + + TP_PROTO(journal_t *journal, int result), + + TP_ARGS(journal, result), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, result ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->result = result; + ), + + TP_printk("dev %s result %d", + jbd2_dev_to_name(__entry->dev), __entry->result) +); + +TRACE_EVENT(jbd2_start_commit, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %s transaction %d sync %d", + jbd2_dev_to_name(__entry->dev), __entry->transaction, + __entry->sync_commit) +); + +TRACE_EVENT(jbd2_commit_locking, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %s transaction %d sync %d", + jbd2_dev_to_name(__entry->dev), __entry->transaction, + __entry->sync_commit) +); + +TRACE_EVENT(jbd2_commit_flushing, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %s transaction %d sync %d", + jbd2_dev_to_name(__entry->dev), __entry->transaction, + __entry->sync_commit) +); + +TRACE_EVENT(jbd2_commit_logging, + + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + __entry->transaction = commit_transaction->t_tid; + ), + + TP_printk("dev %s transaction %d sync %d", + jbd2_dev_to_name(__entry->dev), __entry->transaction, + __entry->sync_commit) +); + +TRACE_EVENT(jbd2_end_commit, + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + + TP_ARGS(journal, commit_transaction), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( char, sync_commit ) + __field( int, transaction ) + __field( int, head ) + ), + + TP_fast_assign( + __entry->dev = journal->j_fs_dev->bd_dev; + __entry->sync_commit = commit_transaction->t_synchronous_commit; + 
__entry->transaction	= commit_transaction->t_tid;
+		__entry->head		= journal->j_tail_sequence;
+	),
+
+	TP_printk("dev %s transaction %d sync %d head %d",
+		  jbd2_dev_to_name(__entry->dev), __entry->transaction,
+		  __entry->sync_commit, __entry->head)
+);
+
+TRACE_EVENT(jbd2_submit_inode_data,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev	)
+		__field(	ino_t,	ino	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+	),
+
+	TP_printk("dev %s ino %lu",
+		  jbd2_dev_to_name(__entry->dev), __entry->ino)
+);
+
+#endif /* _TRACE_JBD2_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

From 9bffad1ed2a003a355ed1b42424a0ae3575275ed Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 17 Jun 2009 11:48:11 -0400
Subject: [PATCH 02/15] ext4: convert instrumentation from markers to tracepoints

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/fsync.c             |   8 +-
 fs/ext4/ialloc.c            |  15 +-
 fs/ext4/inode.c             |  69 +---
 fs/ext4/mballoc.c           |  77 ++--
 fs/ext4/mballoc.h           |   1 -
 fs/ext4/super.c             |   6 +-
 include/trace/events/ext4.h | 719 ++++++++++++++++++++++++++++++++++++
 7 files changed, 774 insertions(+), 121 deletions(-)
 create mode 100644 include/trace/events/ext4.h

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5afe4370840b..83cf6415f599 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,10 +28,12 @@
 #include
 #include
 #include
-#include <linux/marker.h>
+
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
+#include <trace/events/ext4.h>
+
 /*
  * akpm: A new design for ext4_sync_file().
  *
@@ -52,9 +54,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
-	trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
-		   inode->i_sb->s_id, datasync, inode->i_ino,
-		   dentry->d_parent->d_inode->i_ino);
+	trace_ext4_sync_file(file, dentry, datasync);
 
 	/*
 	 * data=writeback:
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3743bd849bce..7d502f3be914 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -23,11 +23,14 @@
 #include
 #include
 #include
+
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
 
+#include <trace/events/ext4.h>
+
 /*
  * ialloc.c contains the inodes allocation and deallocation routines
  */
@@ -208,11 +211,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
 	ino = inode->i_ino;
 	ext4_debug("freeing inode %lu\n", ino);
-	trace_mark(ext4_free_inode,
-		   "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
-		   sb->s_id, inode->i_ino, inode->i_mode,
-		   (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
-		   (unsigned long long) inode->i_blocks);
+	trace_ext4_free_inode(inode);
 
 	/*
 	 * Note: we must free any quota before locking the superblock,
@@ -815,8 +814,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 
 	sb = dir->i_sb;
 	ngroups = ext4_get_groups_count(sb);
-	trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
-		   dir->i_ino, mode);
+	trace_ext4_request_inode(dir, mode);
 	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
@@ -1047,8 +1045,7 @@ got:
 	}
 
 	ext4_debug("allocating inode %lu\n", inode->i_ino);
-	trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
-		   sb->s_id, inode->i_ino, dir->i_ino, mode);
+	trace_ext4_allocate_inode(inode, dir, mode);
 	goto really_out;
 fail:
 	ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 875db944b22f..2418ad36eab5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,11 +37,14 @@
 #include
 #include
 #include
+
 #include "ext4_jbd2.h"
 #include
"xattr.h" #include "acl.h" #include "ext4_extents.h" +#include + #define MPAGE_DA_EXTENT_TAIL 0x01 static inline int ext4_begin_ordered_truncate(struct inode *inode, @@ -1466,10 +1469,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - trace_mark(ext4_write_begin, - "dev %s ino %lu pos %llu len %u flags %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, flags); + trace_ext4_write_begin(inode, pos, len, flags); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason @@ -1611,10 +1611,7 @@ static int ext4_ordered_write_end(struct file *file, struct inode *inode = mapping->host; int ret = 0, ret2; - trace_mark(ext4_ordered_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_ordered_write_end(inode, pos, len, copied); ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { @@ -1658,10 +1655,7 @@ static int ext4_writeback_write_end(struct file *file, struct inode *inode = mapping->host; int ret = 0, ret2; - trace_mark(ext4_writeback_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_writeback_write_end(inode, pos, len, copied); ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; @@ -1705,10 +1699,7 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; loff_t new_i_size; - trace_mark(ext4_journalled_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -2554,9 +2545,7 @@ static int ext4_da_writepage(struct page *page, struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; - trace_mark(ext4_da_writepage, - "dev %s ino %lu page_index %lu", - inode->i_sb->s_id, inode->i_ino, page->index); + trace_ext4_da_writepage(inode, page); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2667,19 +2656,7 @@ static int ext4_da_writepages(struct address_space *mapping, int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); - trace_mark(ext4_da_writepages, - "dev %s ino %lu nr_t_write %ld " - "pages_skipped %ld range_start %llu " - "range_end %llu nonblocking %d " - "for_kupdate %d for_reclaim %d " - "for_writepages %d range_cyclic %d", - inode->i_sb->s_id, inode->i_ino, - wbc->nr_to_write, wbc->pages_skipped, - (unsigned long long) wbc->range_start, - (unsigned long long) wbc->range_end, - wbc->nonblocking, wbc->for_kupdate, - wbc->for_reclaim, wbc->for_writepages, - wbc->range_cyclic); + trace_ext4_da_writepages(inode, wbc); /* * No pages to write? 
This is mainly a kludge to avoid starting @@ -2845,14 +2822,7 @@ out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; wbc->nr_to_write -= nr_to_writebump; - trace_mark(ext4_da_writepage_result, - "dev %s ino %lu ret %d pages_written %d " - "pages_skipped %ld congestion %d " - "more_io %d no_nrwrite_index_update %d", - inode->i_sb->s_id, inode->i_ino, ret, - pages_written, wbc->pages_skipped, - wbc->encountered_congestion, wbc->more_io, - wbc->no_nrwrite_index_update); + trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; } @@ -2904,11 +2874,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; - - trace_mark(ext4_da_write_begin, - "dev %s ino %lu pos %llu len %u flags %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, flags); + trace_ext4_da_write_begin(inode, pos, len, flags); retry: /* * With delayed allocation, we don't log the i_disksize update @@ -3001,10 +2967,7 @@ static int ext4_da_write_end(struct file *file, } } - trace_mark(ext4_da_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); end = start + copied - 1; @@ -3255,9 +3218,7 @@ static int ext4_normal_writepage(struct page *page, loff_t size = i_size_read(inode); loff_t len; - trace_mark(ext4_normal_writepage, - "dev %s ino %lu page_index %lu", - inode->i_sb->s_id, inode->i_ino, page->index); + trace_ext4_normal_writepage(inode, page); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -3343,9 +3304,7 @@ static int ext4_journalled_writepage(struct page *page, loff_t size = i_size_read(inode); loff_t len; - trace_mark(ext4_journalled_writepage, - "dev %s ino %lu page_index %lu", - inode->i_sb->s_id, inode->i_ino, page->index); + trace_ext4_journalled_writepage(inode, page); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ed8482e22c0e..8d98070b48fb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -22,6 +22,8 @@ */ #include "mballoc.h" +#include + /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -340,8 +342,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); - - static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 @@ -2859,9 +2859,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + entry->start_blk + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", - sb->s_id, (unsigned long long) discard_block, - entry->count); + trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, + entry->count); sb_issue_discard(sb, discard_block, entry->count); kmem_cache_free(ext4_free_ext_cachep, entry); @@ -3629,10 +3628,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) mb_debug("new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); - trace_mark(ext4_mb_new_inode_pa, - "dev %s ino %lu pstart %llu len %u lstart %u", - sb->s_id, ac->ac_inode->i_ino, 
- pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3691,9 +3687,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_type = MB_GROUP_PA; mb_debug("new group pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); - trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u", - sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart); + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3783,10 +3778,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ext4_mb_store_history(ac); } - trace_mark(ext4_mb_release_inode_pa, - "dev %s ino %lu block %llu count %u", - sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit, - next - bit); + trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, + next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3820,8 +3813,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (ac) ac->ac_op = EXT4_MB_HISTORY_DISCARD; - trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d", - sb->s_id, pa->pa_pstart, pa->pa_len); + trace_ext4_mb_release_group_pa(ac, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); @@ -3889,6 +3881,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, INIT_LIST_HEAD(&list); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (ac) + ac->ac_sb = sb; repeat: ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, @@ -3987,12 +3981,15 @@ void ext4_discard_preallocations(struct inode *inode) } mb_debug("discard preallocation for inode %lu\n", inode->i_ino); - trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id, - inode->i_ino); + trace_ext4_discard_preallocations(inode); INIT_LIST_HEAD(&list); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (ac) { + ac->ac_sb = sb; + ac->ac_inode = inode; + } repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); @@ -4276,6 +4273,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, INIT_LIST_HEAD(&discard_list); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (ac) + ac->ac_sb = sb; spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], @@ -4445,8 +4444,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) int ret; int freed = 0; - trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", - sb->s_id, needed); + trace_ext4_mb_discard_preallocations(sb, needed); for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; @@ -4475,17 +4473,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, sb = ar->inode->i_sb; sbi = EXT4_SB(sb); - trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu " - "lblk %llu goal %llu lleft %llu lright %llu " - "pleft %llu pright %llu ", - sb->s_id, ar->flags, ar->len, - ar->inode ? 
ar->inode->i_ino : 0,
-		   (unsigned long long) ar->logical,
-		   (unsigned long long) ar->goal,
-		   (unsigned long long) ar->lleft,
-		   (unsigned long long) ar->lright,
-		   (unsigned long long) ar->pleft,
-		   (unsigned long long) ar->pright);
+	trace_ext4_request_blocks(ar);
 
 	/*
 	 * For delayed allocation, we could skip the ENOSPC and
@@ -4521,7 +4509,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	}
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (!ac) {
+	if (ac) {
+		ac->ac_sb = sb;
+		ac->ac_inode = ar->inode;
+	} else {
 		ar->len = 0;
 		*errp = -ENOMEM;
 		goto out1;
 	}
@@ -4594,18 +4585,7 @@ out3:
 						reserv_blks);
 	}
 
-	trace_mark(ext4_allocate_blocks,
-		   "dev %s block %llu flags %u len %u ino %lu "
-		   "logical %llu goal %llu lleft %llu lright %llu "
-		   "pleft %llu pright %llu ",
-		   sb->s_id, (unsigned long long) block,
-		   ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
-		   (unsigned long long) ar->logical,
-		   (unsigned long long) ar->goal,
-		   (unsigned long long) ar->lleft,
-		   (unsigned long long) ar->lright,
-		   (unsigned long long) ar->pleft,
-		   (unsigned long long) ar->pright);
+	trace_ext4_allocate_blocks(ar, (unsigned long long)block);
 
 	return block;
 }
@@ -4740,10 +4720,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	ext4_debug("freeing block %lu\n", block);
-	trace_mark(ext4_free_blocks,
-		   "dev %s block %llu count %lu metadata %d ino %lu",
-		   sb->s_id, (unsigned long long) block, count, metadata,
-		   inode ? inode->i_ino : 0);
+	trace_ext4_free_blocks(inode, block, count, metadata);
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
 	if (ac) {
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 75e34f69215b..c96bb19f58f9 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include <linux/marker.h>
 #include
 #include "ext4_jbd2.h"
 #include "ext4.h"
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 012c4251397e..e8f0b2af4607 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -37,7 +37,6 @@
 #include
 #include
 #include
-#include <linux/marker.h>
 #include
 #include
 #include
@@ -47,6 +46,9 @@
 #include "xattr.h"
 #include "acl.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/ext4.h>
+
 static int default_mb_history_length = 1000;
 
 module_param_named(default_mb_history_length, default_mb_history_length,
@@ -3346,7 +3348,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 	int ret = 0;
 	tid_t target;
 
-	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
+	trace_ext4_sync_fs(sb, wait);
 	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
 		if (wait)
 			jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
new file mode 100644
index 000000000000..acf4cc9cd36d
--- /dev/null
+++ b/include/trace/events/ext4.h
@@ -0,0 +1,719 @@
+#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EXT4_H
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ext4
+
+#include <linux/writeback.h>
+#include "../../../fs/ext4/ext4.h"
+#include "../../../fs/ext4/mballoc.h"
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(ext4_free_inode,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev		)
+		__field(	ino_t,	ino		)
+		__field(	umode_t, mode		)
+		__field(	uid_t,	uid		)
+		__field(	gid_t,	gid		)
+		__field(	blkcnt_t, blocks	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->mode	= inode->i_mode;
+		__entry->uid	= inode->i_uid;
+		__entry->gid	= inode->i_gid;
+		__entry->blocks	= inode->i_blocks;
+	),
+
+	TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks
%llu", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode, + __entry->uid, __entry->gid, __entry->blocks) +); + +TRACE_EVENT(ext4_request_inode, + TP_PROTO(struct inode *dir, int mode), + + TP_ARGS(dir, mode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, dir ) + __field( umode_t, mode ) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->dir = dir->i_ino; + __entry->mode = mode; + ), + + TP_printk("dev %s dir %lu mode %d", + jbd2_dev_to_name(__entry->dev), __entry->dir, __entry->mode) +); + +TRACE_EVENT(ext4_allocate_inode, + TP_PROTO(struct inode *inode, struct inode *dir, int mode), + + TP_ARGS(inode, dir, mode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, dir ) + __field( umode_t, mode ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->dir = dir->i_ino; + __entry->mode = mode; + ), + + TP_printk("dev %s ino %lu dir %lu mode %d", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->dir, __entry->mode) +); + +TRACE_EVENT(ext4_write_begin, + + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int flags), + + TP_ARGS(inode, pos, len, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, flags ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->flags = flags; + ), + + TP_printk("dev %s ino %lu pos %llu len %u flags %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->flags) +); + +TRACE_EVENT(ext4_ordered_write_end, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, copied ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->copied = copied; + ), + + TP_printk("dev %s ino %lu pos %llu len %u copied %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->copied) +); + +TRACE_EVENT(ext4_writeback_write_end, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, copied ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->copied = copied; + ), + + TP_printk("dev %s ino %lu pos %llu len %u copied %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->copied) +); + +TRACE_EVENT(ext4_journalled_write_end, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + TP_ARGS(inode, pos, len, copied), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, copied ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->copied = copied; + ), + + TP_printk("dev %s ino %lu pos %llu len %u 
copied %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->copied) +); + +TRACE_EVENT(ext4_da_writepage, + TP_PROTO(struct inode *inode, struct page *page), + + TP_ARGS(inode, page), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( pgoff_t, index ) + + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->index = page->index; + ), + + TP_printk("dev %s ino %lu page_index %lu", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) +); + +TRACE_EVENT(ext4_da_writepages, + TP_PROTO(struct inode *inode, struct writeback_control *wbc), + + TP_ARGS(inode, wbc), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( long, nr_to_write ) + __field( long, pages_skipped ) + __field( loff_t, range_start ) + __field( loff_t, range_end ) + __field( char, nonblocking ) + __field( char, for_kupdate ) + __field( char, for_reclaim ) + __field( char, for_writepages ) + __field( char, range_cyclic ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->nr_to_write = wbc->nr_to_write; + __entry->pages_skipped = wbc->pages_skipped; + __entry->range_start = wbc->range_start; + __entry->range_end = wbc->range_end; + __entry->nonblocking = wbc->nonblocking; + __entry->for_kupdate = wbc->for_kupdate; + __entry->for_reclaim = wbc->for_reclaim; + __entry->for_writepages = wbc->for_writepages; + __entry->range_cyclic = wbc->range_cyclic; + ), + + TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write, + __entry->pages_skipped, __entry->range_start, + __entry->range_end, __entry->nonblocking, + __entry->for_kupdate, __entry->for_reclaim, + __entry->for_writepages, __entry->range_cyclic) +); + +TRACE_EVENT(ext4_da_writepages_result, + TP_PROTO(struct inode *inode, struct writeback_control *wbc, + int ret, int pages_written), + + TP_ARGS(inode, wbc, ret, pages_written), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( int, ret ) + __field( int, pages_written ) + __field( long, pages_skipped ) + __field( char, encountered_congestion ) + __field( char, more_io ) + __field( char, no_nrwrite_index_update ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->ret = ret; + __entry->pages_written = pages_written; + __entry->pages_skipped = wbc->pages_skipped; + __entry->encountered_congestion = wbc->encountered_congestion; + __entry->more_io = wbc->more_io; + __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; + ), + + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret, + __entry->pages_written, __entry->pages_skipped, + __entry->encountered_congestion, __entry->more_io, + __entry->no_nrwrite_index_update) +); + +TRACE_EVENT(ext4_da_write_begin, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int flags), + + TP_ARGS(inode, pos, len, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, flags ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + 
__entry->pos = pos; + __entry->len = len; + __entry->flags = flags; + ), + + TP_printk("dev %s ino %lu pos %llu len %u flags %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->flags) +); + +TRACE_EVENT(ext4_da_write_end, + TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, + unsigned int copied), + + TP_ARGS(inode, pos, len, copied), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, pos ) + __field( unsigned int, len ) + __field( unsigned int, copied ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->len = len; + __entry->copied = copied; + ), + + TP_printk("dev %s ino %lu pos %llu len %u copied %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, + __entry->copied) +); + +TRACE_EVENT(ext4_normal_writepage, + TP_PROTO(struct inode *inode, struct page *page), + + TP_ARGS(inode, page), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( pgoff_t, index ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->index = page->index; + ), + + TP_printk("dev %s ino %lu page_index %lu", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) +); + +TRACE_EVENT(ext4_journalled_writepage, + TP_PROTO(struct inode *inode, struct page *page), + + TP_ARGS(inode, page), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( pgoff_t, index ) + + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->index = page->index; + ), + + TP_printk("dev %s ino %lu page_index %lu", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) +); + +TRACE_EVENT(ext4_discard_blocks, + TP_PROTO(struct super_block *sb, unsigned long long blk, + unsigned long long count), + + TP_ARGS(sb, blk, count), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( __u64, blk ) + __field( __u64, count ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->blk = blk; + __entry->count = count; + ), + + TP_printk("dev %s blk %llu count %llu", + jbd2_dev_to_name(__entry->dev), __entry->blk, __entry->count) +); + +TRACE_EVENT(ext4_mb_new_inode_pa, + TP_PROTO(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa), + + TP_ARGS(ac, pa), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, pa_pstart ) + __field( __u32, pa_len ) + __field( __u64, pa_lstart ) + + ), + + TP_fast_assign( + __entry->dev = ac->ac_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->pa_pstart = pa->pa_pstart; + __entry->pa_len = pa->pa_len; + __entry->pa_lstart = pa->pa_lstart; + ), + + TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pa_pstart, + __entry->pa_len, __entry->pa_lstart) +); + +TRACE_EVENT(ext4_mb_new_group_pa, + TP_PROTO(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa), + + TP_ARGS(ac, pa), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, pa_pstart ) + __field( __u32, pa_len ) + __field( __u64, pa_lstart ) + + ), + + TP_fast_assign( + __entry->dev = ac->ac_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->pa_pstart = pa->pa_pstart; + __entry->pa_len = pa->pa_len; + __entry->pa_lstart = pa->pa_lstart; + ), + + TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", + 
jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pa_pstart, + __entry->pa_len, __entry->pa_lstart) +); + +TRACE_EVENT(ext4_mb_release_inode_pa, + TP_PROTO(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa, + unsigned long long block, unsigned int count), + + TP_ARGS(ac, pa, block, count), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, block ) + __field( __u32, count ) + + ), + + TP_fast_assign( + __entry->dev = ac->ac_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->block = block; + __entry->count = count; + ), + + TP_printk("dev %s ino %lu block %llu count %u", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->block, + __entry->count) +); + +TRACE_EVENT(ext4_mb_release_group_pa, + TP_PROTO(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa), + + TP_ARGS(ac, pa), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, pa_pstart ) + __field( __u32, pa_len ) + + ), + + TP_fast_assign( + __entry->dev = ac->ac_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->pa_pstart = pa->pa_pstart; + __entry->pa_len = pa->pa_len; + ), + + TP_printk("dev %s pstart %llu len %u", + jbd2_dev_to_name(__entry->dev), __entry->pa_pstart, __entry->pa_len) +); + +TRACE_EVENT(ext4_discard_preallocations, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + ), + + TP_printk("dev %s ino %lu", + jbd2_dev_to_name(__entry->dev), __entry->ino) +); + +TRACE_EVENT(ext4_mb_discard_preallocations, + TP_PROTO(struct super_block *sb, int needed), + + TP_ARGS(sb, needed), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, needed ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->needed = needed; + ), + + TP_printk("dev %s needed %d", + jbd2_dev_to_name(__entry->dev), __entry->needed) +); + +TRACE_EVENT(ext4_request_blocks, + TP_PROTO(struct ext4_allocation_request *ar), + + TP_ARGS(ar), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( unsigned int, flags ) + __field( unsigned int, len ) + __field( __u64, logical ) + __field( __u64, goal ) + __field( __u64, lleft ) + __field( __u64, lright ) + __field( __u64, pleft ) + __field( __u64, pright ) + ), + + TP_fast_assign( + __entry->dev = ar->inode->i_sb->s_dev; + __entry->ino = ar->inode->i_ino; + __entry->flags = ar->flags; + __entry->len = ar->len; + __entry->logical = ar->logical; + __entry->goal = ar->goal; + __entry->lleft = ar->lleft; + __entry->lright = ar->lright; + __entry->pleft = ar->pleft; + __entry->pright = ar->pright; + ), + + TP_printk("dev %s ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->flags, + __entry->len, + (unsigned long long) __entry->logical, + (unsigned long long) __entry->goal, + (unsigned long long) __entry->lleft, + (unsigned long long) __entry->lright, + (unsigned long long) __entry->pleft, + (unsigned long long) __entry->pright) +); + +TRACE_EVENT(ext4_allocate_blocks, + TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block), + + TP_ARGS(ar, block), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, block ) + __field( unsigned int, flags ) + __field( unsigned int, len ) + __field( __u64, logical ) + __field( __u64, goal ) + __field( 
__u64, lleft ) + __field( __u64, lright ) + __field( __u64, pleft ) + __field( __u64, pright ) + ), + + TP_fast_assign( + __entry->dev = ar->inode->i_sb->s_dev; + __entry->ino = ar->inode->i_ino; + __entry->block = block; + __entry->flags = ar->flags; + __entry->len = ar->len; + __entry->logical = ar->logical; + __entry->goal = ar->goal; + __entry->lleft = ar->lleft; + __entry->lright = ar->lright; + __entry->pleft = ar->pleft; + __entry->pright = ar->pright; + ), + + TP_printk("dev %s ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->flags, + __entry->len, __entry->block, + (unsigned long long) __entry->logical, + (unsigned long long) __entry->goal, + (unsigned long long) __entry->lleft, + (unsigned long long) __entry->lright, + (unsigned long long) __entry->pleft, + (unsigned long long) __entry->pright) +); + +TRACE_EVENT(ext4_free_blocks, + TP_PROTO(struct inode *inode, __u64 block, unsigned long count, + int metadata), + + TP_ARGS(inode, block, count, metadata), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, block ) + __field( unsigned long, count ) + __field( int, metadata ) + + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->block = block; + __entry->count = count; + __entry->metadata = metadata; + ), + + TP_printk("dev %s ino %lu block %llu count %lu metadata %d", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->block, + __entry->count, __entry->metadata) +); + +TRACE_EVENT(ext4_sync_file, + TP_PROTO(struct file *file, struct dentry *dentry, int datasync), + + TP_ARGS(file, dentry, datasync), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, parent ) + __field( int, datasync ) + ), + + TP_fast_assign( + __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ino = dentry->d_inode->i_ino; + __entry->datasync = datasync; + __entry->parent = dentry->d_parent->d_inode->i_ino; + ), + + TP_printk("dev %s ino %ld parent %ld datasync %d ", + jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->parent, + __entry->datasync) +); + +TRACE_EVENT(ext4_sync_fs, + TP_PROTO(struct super_block *sb, int wait), + + TP_ARGS(sb, wait), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, wait ) + + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->wait = wait; + ), + + TP_printk("dev %s wait %d", jbd2_dev_to_name(__entry->dev), + __entry->wait) +); + +#endif /* _TRACE_EXT4_H */ + +/* This part must be outside protection */ +#include From 96159f25112595386c56e09eca90284e85e7ecbf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 27 Apr 2009 17:24:22 -0400 Subject: [PATCH 03/15] ext3: avoid unnecessary spinlock in critical POSIX ACL path If a filesystem supports POSIX ACL's, the VFS layer expects the filesystem to do POSIX ACL checks on any files not owned by the caller, and it does this for every single pathname component that it looks up. That obviously can be pretty expensive if the filesystem isn't careful about it, especially with locking. That's doubly sad, since the common case tends to be that there are no ACL's associated with the files in question. ext3 already caches the ACL data so that it doesn't have to look it up over and over again, but it does so by taking the inode->i_lock spinlock on every lookup. 
Which is a noticeable overhead even if it's a private lock, especially
on CPU's where the serialization is expensive (eg Intel Netburst aka
'P4').

For the special case of not actually having any ACL's, all that locking
is unnecessary.  Even if somebody else were to be changing the ACL's on
another CPU, we simply don't care - if we've seen a NULL ACL, we might
as well use it.

So just load the ACL speculatively without any locking, and if it was
NULL, just use it.  If it's non-NULL (either because we had a cached
entry, or because the cache hasn't been filled in at all), it means that
we'll need to get the lock and re-load it properly.

This is noticeable even on Nehalem, which does locking quite well (much
better than P4).  From lmbench:

	Processor, Processes - times in microseconds - smaller is better
	--------------------------------------------------------------------
	Host                 OS  Mhz null null      open slct fork exec sh
	                             call  I/O stat clos TCP  proc proc proc
	--------- ------------- ---- ---- ---- ---- ---- ---- ---- ---- ----

  - before:
	nehalem.l Linux 2.6.30- 3193 0.04 0.09 0.95 1.45 2.18 69.1 273. 1141
	nehalem.l Linux 2.6.30- 3193 0.04 0.09 0.95 1.48 2.28 69.9 253. 1140
	nehalem.l Linux 2.6.30- 3193 0.04 0.10 0.95 1.42 2.19 68.6 284. 1141

  - after:
	nehalem.l Linux 2.6.30- 3193 0.04 0.09 0.92 1.44 2.12 68.3 282. 1094
	nehalem.l Linux 2.6.30- 3193 0.04 0.09 0.92 1.39 2.20 67.0 308. 1123
	nehalem.l Linux 2.6.30- 3193 0.04 0.09 0.92 1.39 2.36 67.4 293. 1148

where you can see what appears to be a roughly 3% improvement in stat
and open/close latencies from just the removal of the locking overhead.

Of course, this only matters for files you don't own (the owner never
needs to do the ACL checks), but that's the common case for libraries,
header files, and executables.  As well as for the base components of
any absolute pathname, even if you are the owner of the final file.

[ At some point we probably want to move this ACL caching logic entirely
  into the VFS layer (and only call down to the filesystem when
  uncached), but in the meantime this improves ext3 a bit.

  A similar fix to btrfs makes a much bigger difference (15x improvement
  in lmbench) due to broken caching. ]

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext3/acl.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index d81ef2fdb08e..e0c745451715 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -129,12 +129,15 @@ fail:
 static inline struct posix_acl *
 ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
 {
-	struct posix_acl *acl = EXT3_ACL_NOT_CACHED;
+	struct posix_acl *acl = ACCESS_ONCE(*i_acl);
 
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT3_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
+	if (acl) {
+		spin_lock(&inode->i_lock);
+		acl = *i_acl;
+		if (acl != EXT3_ACL_NOT_CACHED)
+			acl = posix_acl_dup(acl);
+		spin_unlock(&inode->i_lock);
+	}
 
 	return acl;
 }

From 8b0f9e8f78bd0a65fa001bf18f2c47eef2893a10 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 27 Apr 2009 17:33:23 -0400
Subject: [PATCH 04/15] ext4: avoid unnecessary spinlock in critical POSIX ACL path

If a filesystem supports POSIX ACL's, the VFS layer expects the
filesystem to do POSIX ACL checks on any files not owned by the caller,
and it does this for every single pathname component that it looks up.

That obviously can be pretty expensive if the filesystem isn't careful
about it, especially with locking.
That's doubly sad, since the common case tends to be that there are no
ACL's associated with the files in question.

ext4 already caches the ACL data so that it doesn't have to look it up
over and over again, but it does so by taking the inode->i_lock spinlock
on every lookup.  Which is a noticeable overhead even if it's a private
lock, especially on CPU's where the serialization is expensive (eg Intel
Netburst aka 'P4').

For the special case of not actually having any ACL's, all that locking
is unnecessary.  Even if somebody else were to be changing the ACL's on
another CPU, we simply don't care - if we've seen a NULL ACL, we might
as well use it.

So just load the ACL speculatively without any locking, and if it was
NULL, just use it.  If it's non-NULL (either because we had a cached
entry, or because the cache hasn't been filled in at all), it means that
we'll need to get the lock and re-load it properly.

(This commit was ported from a patch originally authored by Linus for
ext3.)

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/acl.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 647e0d65a284..605aeed96d68 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -129,12 +129,15 @@ fail:
 static inline struct posix_acl *
 ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
 {
-	struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
+	struct posix_acl *acl = ACCESS_ONCE(*i_acl);
 
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT4_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
+	if (acl) {
+		spin_lock(&inode->i_lock);
+		acl = *i_acl;
+		if (acl != EXT4_ACL_NOT_CACHED)
+			acl = posix_acl_dup(acl);
+		spin_unlock(&inode->i_lock);
+	}
 
 	return acl;
 }

From 748de6736c1e482e111f9d1b5a5d5b1787600cad Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Wed, 17 Jun 2009 19:24:03 -0400
Subject: [PATCH 05/15] ext4: online defrag -- Add EXT4_IOC_MOVE_EXT ioctl

The EXT4_IOC_MOVE_EXT ioctl exchanges the blocks between orig_fd and
donor_fd, and then writes the file data of orig_fd to donor_fd.
ext4_mext_move_extent() is the main function of ext4 online defrag,
and this patch includes all functions related to ext4 online defrag.
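For reference, a defragmentation tool would drive the new ioctl roughly
as follows.  This is an illustrative userspace sketch, not part of this
patch: the struct layout and ioctl number simply mirror the definitions
added to fs/ext4/ext4.h below, and it assumes the donor file has already
been populated with suitable (ideally contiguous) replacement blocks
before the swap:

    /*
     * Illustrative EXT4_IOC_MOVE_EXT user.  Assumes <orig> and <donor>
     * live on the same ext4 filesystem and that the donor already has
     * the blocks to swap in.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/ioctl.h>

    /* Userspace mirror of the structure this patch adds to ext4.h. */
    struct move_extent {
            uint32_t reserved;      /* should be zero */
            uint32_t donor_fd;      /* donor file descriptor */
            uint64_t orig_start;    /* logical start offset in block for orig */
            uint64_t donor_start;   /* logical start offset in block for donor */
            uint64_t len;           /* block length to be moved */
            uint64_t moved_len;     /* moved block length */
    };
    #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)

    int main(int argc, char *argv[])
    {
            struct move_extent me = { .reserved = 0 };
            int orig_fd, donor_fd;

            if (argc != 4) {
                    fprintf(stderr, "usage: %s orig donor len-in-blocks\n",
                            argv[0]);
                    return 1;
            }
            orig_fd = open(argv[1], O_RDWR);
            donor_fd = open(argv[2], O_WRONLY);
            if (orig_fd < 0 || donor_fd < 0) {
                    perror("open");
                    return 1;
            }
            me.donor_fd = donor_fd;
            me.orig_start = 0;      /* start of the original file */
            me.donor_start = 0;     /* start of the donor file */
            me.len = strtoull(argv[3], NULL, 0);

            /*
             * Swap the block ranges; on success the original file's data
             * ends up in the blocks contributed by the donor.
             */
            if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) < 0) {
                    perror("EXT4_IOC_MOVE_EXT");
                    return 1;
            }
            printf("moved %llu blocks\n", (unsigned long long)me.moved_len);
            close(orig_fd);
            close(donor_fd);
            return 0;
    }

Note the permission check added to ext4_ioctl() below: without
CAP_DAC_OVERRIDE, the caller's fsuid must own the original file and both
files must be owner-readable, so an unprivileged defragmenter can only
reshuffle files it owns.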
Signed-off-by: Akira Fujita Signed-off-by: Takashi Sato Signed-off-by: Kazuya Mio Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 2 +- fs/ext4/ext4.h | 15 + fs/ext4/ext4_extents.h | 4 + fs/ext4/extents.c | 4 +- fs/ext4/ioctl.c | 36 ++ fs/ext4/move_extent.c | 1320 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1378 insertions(+), 3 deletions(-) create mode 100644 fs/ext4/move_extent.c diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 8a34710ecf40..8867b2a1e5fe 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cc7d5edc38c9..276a26f117e6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -352,6 +352,7 @@ struct ext4_new_group_data { /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) /* * ioctl commands in 32 bit emulation @@ -447,6 +448,15 @@ struct ext4_inode { __le32 i_version_hi; /* high 32 bits for 64-bit version */ }; +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; +#define MAX_DEFRAG_SIZE ((1UL<<31) - 1) #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) @@ -1647,6 +1657,11 @@ extern int ext4_get_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, int flags); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +/* move_extent.c */ +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + /* * Add new method to test wether block and inode bitmaps are properly diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index f0c3ec85bd48..20a84105a10b 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -221,12 +221,16 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) } extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); +extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); extern int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2593f748c3a4..50322a09bd01 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -49,7 +49,7 @@ * ext_pblock: * combine low and high parts of physical block number 
into ext4_fsblk_t */ -static ext4_fsblk_t ext_pblock(struct ext4_extent *ex) +ext4_fsblk_t ext_pblock(struct ext4_extent *ex) { ext4_fsblk_t block; @@ -1417,7 +1417,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, return err; } -static int +int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 91e75f7a9e73..bb415408fdb6 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "ext4_jbd2.h" #include "ext4.h" @@ -213,6 +214,41 @@ setversion_out: return err; } + + case EXT4_IOC_MOVE_EXT: { + struct move_extent me; + struct file *donor_filp; + int err; + + if (copy_from_user(&me, + (struct move_extent __user *)arg, sizeof(me))) + return -EFAULT; + + donor_filp = fget(me.donor_fd); + if (!donor_filp) + return -EBADF; + + if (!capable(CAP_DAC_OVERRIDE)) { + if ((current->real_cred->fsuid != inode->i_uid) || + !(inode->i_mode & S_IRUSR) || + !(donor_filp->f_dentry->d_inode->i_mode & + S_IRUSR)) { + fput(donor_filp); + return -EACCES; + } + } + + err = ext4_move_extents(filp, donor_filp, me.orig_start, + me.donor_start, me.len, &me.moved_len); + fput(donor_filp); + + if (!err) + if (copy_to_user((struct move_extent *)arg, + &me, sizeof(me))) + return -EFAULT; + return err; + } + case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; struct super_block *sb = inode->i_sb; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c new file mode 100644 index 000000000000..bbf2dd9404dc --- /dev/null +++ b/fs/ext4/move_extent.c @@ -0,0 +1,1320 @@ +/* + * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. + * Written by Takashi Sato + * Akira Fujita + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "ext4.h" + +#define get_ext_path(path, inode, block, ret) \ + do { \ + path = ext4_ext_find_extent(inode, block, path); \ + if (IS_ERR(path)) { \ + ret = PTR_ERR(path); \ + path = NULL; \ + } \ + } while (0) + +/** + * copy_extent_status - Copy the extent's initialization status + * + * @src: an extent for getting initialize status + * @dest: an extent to be set the status + */ +static void +copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) +{ + if (ext4_ext_is_uninitialized(src)) + ext4_ext_mark_uninitialized(dest); + else + dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); +} + +/** + * mext_next_extent - Search for the next extent and set it to "extent" + * + * @inode: inode which is searched + * @path: this will obtain data for the next extent + * @extent: pointer to the next extent we have just gotten + * + * Search the next extent in the array of ext4_ext_path structure (@path) + * and set it to ext4_extent structure (@extent). In addition, the member of + * @path (->p_ext) also points the next extent. Return 0 on success, 1 if + * ext4_ext_path structure refers to the last extent, or a negative error + * value on failure. 
+ */ +static int +mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent) +{ + int ppos, leaf_ppos = path->p_depth; + + ppos = leaf_ppos; + if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { + /* leaf block */ + *extent = ++path[ppos].p_ext; + return 0; + } + + while (--ppos >= 0) { + if (EXT_LAST_INDEX(path[ppos].p_hdr) > + path[ppos].p_idx) { + int cur_ppos = ppos; + + /* index block */ + path[ppos].p_idx++; + path[ppos].p_block = idx_pblock(path[ppos].p_idx); + if (path[ppos+1].p_bh) + brelse(path[ppos+1].p_bh); + path[ppos+1].p_bh = + sb_bread(inode->i_sb, path[ppos].p_block); + if (!path[ppos+1].p_bh) + return -EIO; + path[ppos+1].p_hdr = + ext_block_hdr(path[ppos+1].p_bh); + + /* Halfway index block */ + while (++cur_ppos < leaf_ppos) { + path[cur_ppos].p_idx = + EXT_FIRST_INDEX(path[cur_ppos].p_hdr); + path[cur_ppos].p_block = + idx_pblock(path[cur_ppos].p_idx); + if (path[cur_ppos+1].p_bh) + brelse(path[cur_ppos+1].p_bh); + path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, + path[cur_ppos].p_block); + if (!path[cur_ppos+1].p_bh) + return -EIO; + path[cur_ppos+1].p_hdr = + ext_block_hdr(path[cur_ppos+1].p_bh); + } + + /* leaf block */ + path[leaf_ppos].p_ext = *extent = + EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + return 0; + } + } + /* We found the last extent */ + return 1; +} + +/** + * mext_double_down_read - Acquire two inodes' read semaphore + * + * @orig_inode: original inode structure + * @donor_inode: donor inode structure + * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. + */ +static void +mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) +{ + struct inode *first = orig_inode, *second = donor_inode; + + BUG_ON(orig_inode == NULL || donor_inode == NULL); + + /* + * Use the inode number to provide the stable locking order instead + * of its address, because the C language doesn't guarantee you can + * compare pointers that don't come from the same array. + */ + if (donor_inode->i_ino < orig_inode->i_ino) { + first = donor_inode; + second = orig_inode; + } + + down_read(&EXT4_I(first)->i_data_sem); + down_read(&EXT4_I(second)->i_data_sem); +} + +/** + * mext_double_down_write - Acquire two inodes' write semaphore + * + * @orig_inode: original inode structure + * @donor_inode: donor inode structure + * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. + */ +static void +mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) +{ + struct inode *first = orig_inode, *second = donor_inode; + + BUG_ON(orig_inode == NULL || donor_inode == NULL); + + /* + * Use the inode number to provide the stable locking order instead + * of its address, because the C language doesn't guarantee you can + * compare pointers that don't come from the same array. + */ + if (donor_inode->i_ino < orig_inode->i_ino) { + first = donor_inode; + second = orig_inode; + } + + down_write(&EXT4_I(first)->i_data_sem); + down_write(&EXT4_I(second)->i_data_sem); +} + +/** + * mext_double_up_read - Release two inodes' read semaphore + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second + * Release read semaphore of two inodes (orig and donor). 
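The i_ino-based ordering used by these helpers is the standard defense against ABBA deadlocks: two tasks locking the same inode pair in opposite argument order would otherwise block each other forever. A minimal userspace sketch of the same discipline, with pthread mutexes standing in for i_data_sem (the inode_t type and its fields are illustrative, not kernel API):

    #include <pthread.h>

    typedef struct {
        unsigned long ino;          /* stable identity, like i_ino */
        pthread_mutex_t sem;        /* stands in for i_data_sem */
    } inode_t;

    /* Always lock the lower inode number first, exactly as
     * mext_double_down_read/write() do above. */
    static void double_lock(inode_t *a, inode_t *b)
    {
        inode_t *first = a, *second = b;

        if (b->ino < a->ino) {
            first = b;
            second = a;
        }
        pthread_mutex_lock(&first->sem);
        pthread_mutex_lock(&second->sem);
    }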
+ */ +static void +mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) +{ + BUG_ON(orig_inode == NULL || donor_inode == NULL); + + up_read(&EXT4_I(orig_inode)->i_data_sem); + up_read(&EXT4_I(donor_inode)->i_data_sem); +} + +/** + * mext_double_up_write - Release two inodes' write semaphore + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second + * Release write semaphore of two inodes (orig and donor). + */ +static void +mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) +{ + BUG_ON(orig_inode == NULL || donor_inode == NULL); + + up_write(&EXT4_I(orig_inode)->i_data_sem); + up_write(&EXT4_I(donor_inode)->i_data_sem); +} + +/** + * mext_insert_across_blocks - Insert extents across leaf block + * + * @handle: journal handle + * @orig_inode: original inode + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Allocate a new leaf block and insert extents into it. Return 0 on success, + * or a negative error value on failure. + */ +static int +mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, + struct ext4_extent *o_start, struct ext4_extent *o_end, + struct ext4_extent *start_ext, struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_ext_path *orig_path = NULL; + ext4_lblk_t eblock = 0; + int new_flag = 0; + int end_flag = 0; + int err = 0; + + if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { + if (o_start == o_end) { + + /* start_ext new_ext end_ext + * donor |---------|-----------|--------| + * orig |------------------------------| + */ + end_flag = 1; + } else { + + /* start_ext new_ext end_ext + * donor |---------|----------|---------| + * orig |---------------|--------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + } + + o_start->ee_len = start_ext->ee_len; + new_flag = 1; + + } else if (start_ext->ee_len && new_ext->ee_len && + !end_ext->ee_len && o_start == o_end) { + + /* start_ext new_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_start->ee_len = start_ext->ee_len; + new_flag = 1; + + } else if (!start_ext->ee_len && new_ext->ee_len && + end_ext->ee_len && o_start == o_end) { + + /* new_ext end_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + + /* + * Set 0 to the extent block if new_ext was + * the first block. 
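The case analysis above comes down to splitting one original extent around an inserted range: whatever precedes the new extent survives as start_ext, whatever follows survives as end_ext, and a zero ee_len marks an absent piece. A schematic sketch with plain integers in place of the on-disk ext4_extent encoding:

    struct range { unsigned int lblk, len; };   /* illustrative only */

    /* Split @orig around @new; a zero length marks an absent piece,
     * mirroring the ee_len == 0 tests in mext_insert_across_blocks(). */
    static void split_around(struct range orig, struct range new,
                             struct range *start, struct range *end)
    {
        unsigned int orig_end = orig.lblk + orig.len;   /* exclusive */
        unsigned int new_end = new.lblk + new.len;      /* exclusive */

        start->lblk = orig.lblk;
        start->len = new.lblk > orig.lblk ? new.lblk - orig.lblk : 0;
        end->lblk = new_end;
        end->len = orig_end > new_end ? orig_end - new_end : 0;
    }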
+ */ + if (new_ext->ee_block) + eblock = le32_to_cpu(new_ext->ee_block); + + new_flag = 1; + } else { + ext4_debug("ext4 move extent: Unexpected insert case\n"); + return -EIO; + } + + if (new_flag) { + get_ext_path(orig_path, orig_inode, eblock, err); + if (orig_path == NULL) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, new_ext)) + goto out; + } + + if (end_flag) { + get_ext_path(orig_path, orig_inode, + le32_to_cpu(end_ext->ee_block) - 1, err); + if (orig_path == NULL) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, end_ext)) + goto out; + } +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + + return err; + +} + +/** + * mext_insert_inside_block - Insert new extent to the extent block + * + * @o_start: first original extent to be moved + * @o_end: last original extent to be moved + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * @eh: extent header of target leaf block + * @range_to_move: used to decide how to insert extent + * + * Insert extents into the leaf block. The extent (@o_start) is overwritten + * by inserted extents. + */ +static void +mext_insert_inside_block(struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext, + struct ext4_extent_header *eh, + int range_to_move) +{ + int i = 0; + unsigned long len; + + /* Move the existing extents */ + if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { + len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - + (unsigned long)(o_end + 1); + memmove(o_end + 1 + range_to_move, o_end + 1, len); + } + + /* Insert start entry */ + if (start_ext->ee_len) + o_start[i++].ee_len = start_ext->ee_len; + + /* Insert new entry */ + if (new_ext->ee_len) { + o_start[i] = *new_ext; + ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); + } + + /* Insert end entry */ + if (end_ext->ee_len) + o_start[i] = *end_ext; + + /* Increment the total entries counter on the extent block */ + le16_add_cpu(&eh->eh_entries, range_to_move); +} + +/** + * mext_insert_extents - Insert new extent + * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Call the function to insert extents. If we cannot add more extents into + * the leaf block, we call mext_insert_across_blocks() to create a + * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 + * on success, or a negative error value on failure. + */ +static int +mext_insert_extents(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, + struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_extent_header *eh; + unsigned long need_slots, slots_range; + int range_to_move, depth, ret; + + /* + * The extents need to be inserted + * start_extent + new_extent + end_extent. + */ + need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + + (new_ext->ee_len ? 
1 : 0); + + /* The number of slots between start and end */ + slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) + / sizeof(struct ext4_extent); + + /* Range to move the end of extent */ + range_to_move = need_slots - slots_range; + depth = orig_path->p_depth; + orig_path += depth; + eh = orig_path->p_hdr; + + if (depth) { + /* Register to journal */ + ret = ext4_journal_get_write_access(handle, orig_path->p_bh); + if (ret) + return ret; + } + + /* Expansion */ + if (range_to_move > 0 && + (range_to_move > le16_to_cpu(eh->eh_max) + - le16_to_cpu(eh->eh_entries))) { + + ret = mext_insert_across_blocks(handle, orig_inode, o_start, + o_end, start_ext, new_ext, end_ext); + if (ret < 0) + return ret; + } else + mext_insert_inside_block(o_start, o_end, start_ext, new_ext, + end_ext, eh, range_to_move); + + if (depth) { + ret = ext4_handle_dirty_metadata(handle, orig_inode, + orig_path->p_bh); + if (ret) + return ret; + } else { + ret = ext4_mark_inode_dirty(handle, orig_inode); + if (ret < 0) + return ret; + } + + return 0; +} + +/** + * mext_leaf_block - Move one leaf extent block into the inode. + * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @dext: donor extent + * @from: start offset on the target file + * + * In order to insert extents into the leaf block, we must divide the extent + * in the leaf block into three extents. The one is located to be inserted + * extents, and the others are located around it. + * + * Therefore, this function creates structures to save extents of the leaf + * block, and inserts extents by calling mext_insert_extents() with + * created extents. Return 0 on success, or a negative error value on failure. + */ +static int +mext_leaf_block(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, struct ext4_extent *dext, + ext4_lblk_t *from) +{ + struct ext4_extent *oext, *o_start, *o_end, *prev_ext; + struct ext4_extent new_ext, start_ext, end_ext; + ext4_lblk_t new_ext_end; + ext4_fsblk_t new_phys_end; + int oext_alen, new_ext_alen, end_ext_alen; + int depth = ext_depth(orig_inode); + int ret; + + o_start = o_end = oext = orig_path[depth].p_ext; + oext_alen = ext4_ext_get_actual_len(oext); + start_ext.ee_len = end_ext.ee_len = 0; + + new_ext.ee_block = cpu_to_le32(*from); + ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); + new_ext.ee_len = dext->ee_len; + new_ext_alen = ext4_ext_get_actual_len(&new_ext); + new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; + new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1; + + /* + * Case: original extent is first + * oext |--------| + * new_ext |--| + * start_ext |--| + */ + if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && + le32_to_cpu(new_ext.ee_block) < + le32_to_cpu(oext->ee_block) + oext_alen) { + start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - + le32_to_cpu(oext->ee_block)); + copy_extent_status(oext, &start_ext); + } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { + prev_ext = oext - 1; + /* + * We can merge new_ext into previous extent, + * if these are contiguous and same extent type. 
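Contiguous here means contiguous in both the logical and the physical block space, with matching initialization state. A sketch of that predicate (the in-tree ext4_can_extents_be_merged() additionally enforces the maximum extent length for initialized and uninitialized extents):

    /* Can extent 2 be appended to extent 1?  Sketch only. */
    static int can_merge(unsigned int lblk1, unsigned long long pblk1,
                         unsigned int len1, int uninit1,
                         unsigned int lblk2, unsigned long long pblk2,
                         int uninit2)
    {
        return uninit1 == uninit2 &&        /* same extent type */
               lblk1 + len1 == lblk2 &&     /* logically adjacent */
               pblk1 + len1 == pblk2;       /* physically adjacent */
    }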
+ */ + if (ext4_can_extents_be_merged(orig_inode, prev_ext, + &new_ext)) { + o_start = prev_ext; + start_ext.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(prev_ext) + + new_ext_alen); + copy_extent_status(prev_ext, &start_ext); + new_ext.ee_len = 0; + } + } + + /* + * Case: new_ext_end must be less than oext + * oext |-----------| + * new_ext |-------| + */ + BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); + + /* + * Case: new_ext is smaller than original extent + * oext |---------------| + * new_ext |-----------| + * end_ext |---| + */ + if (le32_to_cpu(oext->ee_block) <= new_ext_end && + new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { + end_ext.ee_len = + cpu_to_le16(le32_to_cpu(oext->ee_block) + + oext_alen - 1 - new_ext_end); + copy_extent_status(oext, &end_ext); + end_ext_alen = ext4_ext_get_actual_len(&end_ext); + ext4_ext_store_pblock(&end_ext, + (ext_pblock(o_end) + oext_alen - end_ext_alen)); + end_ext.ee_block = + cpu_to_le32(le32_to_cpu(o_end->ee_block) + + oext_alen - end_ext_alen); + } + + ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, + o_end, &start_ext, &new_ext, &end_ext); + return ret; +} + +/** + * mext_calc_swap_extents - Calculate extents for extent swapping. + * + * @tmp_dext: the extent that will belong to the original inode + * @tmp_oext: the extent that will belong to the donor inode + * @orig_off: block offset of original inode + * @donor_off: block offset of donor inode + * @max_count: the maximum length of extents + */ +static void +mext_calc_swap_extents(struct ext4_extent *tmp_dext, + struct ext4_extent *tmp_oext, + ext4_lblk_t orig_off, ext4_lblk_t donor_off, + ext4_lblk_t max_count) +{ + ext4_lblk_t diff, orig_diff; + struct ext4_extent dext_old, oext_old; + + dext_old = *tmp_dext; + oext_old = *tmp_oext; + + /* When tmp_dext is too large, pick up the target range. */ + diff = donor_off - le32_to_cpu(tmp_dext->ee_block); + + ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff); + tmp_dext->ee_block = + cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); + + if (max_count < ext4_ext_get_actual_len(tmp_dext)) + tmp_dext->ee_len = cpu_to_le16(max_count); + + orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); + ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff); + + /* Adjust extent length if donor extent is larger than orig */ + if (ext4_ext_get_actual_len(tmp_dext) > + ext4_ext_get_actual_len(tmp_oext) - orig_diff) + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - + orig_diff); + + tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); + + copy_extent_status(&oext_old, tmp_dext); + copy_extent_status(&dext_old, tmp_oext); +} + +/** + * mext_replace_branches - Replace original extents with new extents + * + * @handle: journal handle + * @orig_inode: original inode + * @donor_inode: donor inode + * @from: block offset of orig_inode + * @count: block count to be replaced + * + * Replace original inode extents and donor inode extents page by page. + * We implement this replacement in the following three steps: + * 1. Save the block information of original and donor inodes into + * dummy extents. + * 2. Change the block information of original inode to point at the + * donor inode blocks. + * 3. Change the block information of donor inode to point at the saved + * original inode blocks in the dummy extents. + * + * Return 0 on success, or a negative error value on failure.
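Stripped of the journaling and tree surgery, the three steps are a swap through temporaries applied to block mappings rather than data. Schematically (plain structs standing in for the on-disk extents):

    struct mapping { unsigned long long pblk; };   /* illustrative */

    static void swap_mappings(struct mapping *orig, struct mapping *donor)
    {
        struct mapping saved_orig = *orig;   /* 1: save into dummies */
        struct mapping saved_donor = *donor;

        orig->pblk = saved_donor.pblk;       /* 2: orig -> donor blocks */
        donor->pblk = saved_orig.pblk;       /* 3: donor -> saved blocks */
    }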
+ */ +static int +mext_replace_branches(handle_t *handle, struct inode *orig_inode, + struct inode *donor_inode, ext4_lblk_t from, + ext4_lblk_t count) +{ + struct ext4_ext_path *orig_path = NULL; + struct ext4_ext_path *donor_path = NULL; + struct ext4_extent *oext, *dext; + struct ext4_extent tmp_dext, tmp_oext; + ext4_lblk_t orig_off = from, donor_off = from; + int err = 0; + int depth; + int replaced_count = 0; + int dext_alen; + + mext_double_down_write(orig_inode, donor_inode); + + /* Get the original extent for the block "orig_off" */ + get_ext_path(orig_path, orig_inode, orig_off, err); + if (orig_path == NULL) + goto out; + + /* Get the donor extent for the head */ + get_ext_path(donor_path, donor_inode, donor_off, err); + if (donor_path == NULL) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + tmp_oext = *oext; + + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + + mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count); + + /* Loop for the donor extents */ + while (1) { + /* The extent for donor must be found. */ + BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); + + /* Set donor extent to orig extent */ + err = mext_leaf_block(handle, orig_inode, + orig_path, &tmp_dext, &orig_off); + if (err < 0) + goto out; + + /* Set orig extent to donor extent */ + err = mext_leaf_block(handle, donor_inode, + donor_path, &tmp_oext, &donor_off); + if (err < 0) + goto out; + + dext_alen = ext4_ext_get_actual_len(&tmp_dext); + replaced_count += dext_alen; + donor_off += dext_alen; + orig_off += dext_alen; + + /* Already moved the expected blocks */ + if (replaced_count >= count) + break; + + if (orig_path) + ext4_ext_drop_refs(orig_path); + get_ext_path(orig_path, orig_inode, orig_off, err); + if (orig_path == NULL) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + if (le32_to_cpu(oext->ee_block) + + ext4_ext_get_actual_len(oext) <= orig_off) { + err = 0; + goto out; + } + tmp_oext = *oext; + + if (donor_path) + ext4_ext_drop_refs(donor_path); + get_ext_path(donor_path, donor_inode, + donor_off, err); + if (donor_path == NULL) + goto out; + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + if (le32_to_cpu(dext->ee_block) + + ext4_ext_get_actual_len(dext) <= donor_off) { + err = 0; + goto out; + } + tmp_dext = *dext; + + mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, + count - replaced_count); + } + +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (donor_path) { + ext4_ext_drop_refs(donor_path); + kfree(donor_path); + } + + mext_double_up_write(orig_inode, donor_inode); + return err; +} + +/** + * move_extent_per_page - Move extent data per page + * + * @o_filp: file structure of original file + * @donor_inode: donor inode + * @orig_page_offset: page index on original file + * @data_offset_in_page: block index where data swapping starts + * @block_len_in_page: the number of blocks to be swapped + * @uninit: orig extent is uninitialized or not + * + * Save the data in original inode blocks and replace original inode extents + * with donor inode extents by calling mext_replace_branches(). + * Finally, write out the saved data in new original inode blocks. Return 0 + * on success, or a negative error value on failure. 
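The page/block bookkeeping in this function is pure shift arithmetic, since blocks_per_page is PAGE_CACHE_SIZE >> i_blkbits. A standalone example with 4096-byte pages and 1024-byte blocks:

    #include <stdio.h>

    int main(void)
    {
        unsigned int page_shift = 12;    /* 4096-byte pages */
        unsigned int blkbits = 10;       /* 1024-byte blocks */
        unsigned int blocks_per_page = 1U << (page_shift - blkbits);
        unsigned long lblk = 4097;       /* some logical block */

        printf("blocks per page: %u\n", blocks_per_page);        /* 4 */
        printf("page index: %lu\n", lblk >> (page_shift - blkbits));
        printf("block within page: %lu\n", lblk % blocks_per_page);
        return 0;
    }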
+ */ +static int +move_extent_par_page(struct file *o_filp, struct inode *donor_inode, + pgoff_t orig_page_offset, int data_offset_in_page, + int block_len_in_page, int uninit) +{ + struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct address_space *mapping = orig_inode->i_mapping; + struct buffer_head *bh; + struct page *page = NULL; + const struct address_space_operations *a_ops = mapping->a_ops; + handle_t *handle; + ext4_lblk_t orig_blk_offset; + long long offs = orig_page_offset << PAGE_CACHE_SHIFT; + unsigned long blocksize = orig_inode->i_sb->s_blocksize; + unsigned int w_flags = 0; + unsigned int tmp_data_len, data_len; + void *fsdata; + int ret, i, jblocks; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + + /* + * It needs twice the amount of ordinary journal buffers because + * inode and donor_inode may change each different metadata blocks. + */ + jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; + handle = ext4_journal_start(orig_inode, jblocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + + if (segment_eq(get_fs(), KERNEL_DS)) + w_flags |= AOP_FLAG_UNINTERRUPTIBLE; + + orig_blk_offset = orig_page_offset * blocks_per_page + + data_offset_in_page; + + /* + * If orig extent is uninitialized one, + * it's not necessary force the page into memory + * and then force it to be written out again. + * Just swap data blocks between orig and donor. + */ + if (uninit) { + ret = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page); + + /* Clear the inode cache not to refer to the old data */ + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + goto out2; + } + + offs = (long long)orig_blk_offset << orig_inode->i_blkbits; + + /* Calculate data_len */ + if ((orig_blk_offset + block_len_in_page - 1) == + ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { + /* Replace the last block */ + tmp_data_len = orig_inode->i_size & (blocksize - 1); + /* + * If data_len equal zero, it shows data_len is multiples of + * blocksize. So we set appropriate value. + */ + if (tmp_data_len == 0) + tmp_data_len = blocksize; + + data_len = tmp_data_len + + ((block_len_in_page - 1) << orig_inode->i_blkbits); + } else { + data_len = block_len_in_page << orig_inode->i_blkbits; + } + + ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, + &page, &fsdata); + if (unlikely(ret < 0)) + goto out; + + if (!PageUptodate(page)) { + mapping->a_ops->readpage(o_filp, page); + lock_page(page); + } + + /* + * try_to_release_page() doesn't call releasepage in writeback mode. + * We should care about the order of writing to the same file + * by multiple move extent processes. + * It needs to call wait_on_page_writeback() to wait for the + * writeback of the page. 
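The data_len calculation above deserves a closer look: when the swapped range ends at i_size, the final block contributes i_size modulo the block size bytes, except that an exact multiple means the last block is full. Extracted as a helper (a sketch, not the patch's code):

    /* Bytes to write when the swapped range covers the file tail. */
    static unsigned int tail_data_len(unsigned long long i_size,
                                      unsigned int blocksize,
                                      unsigned int blkbits,
                                      int block_len_in_page)
    {
        unsigned int tail = i_size & (blocksize - 1);

        if (tail == 0)   /* block-aligned size: full final block */
            tail = blocksize;
        return tail + ((block_len_in_page - 1) << blkbits);
    }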
+ */ + if (PageWriteback(page)) + wait_on_page_writeback(page); + + /* Release old bh and drop refs */ + try_to_release_page(page, 0); + + ret = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page); + if (ret < 0) + goto out; + + /* Clear the inode cache not to refer to the old data */ + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); + + bh = page_buffers(page); + for (i = 0; i < data_offset_in_page; i++) + bh = bh->b_this_page; + + for (i = 0; i < block_len_in_page; i++) { + ret = ext4_get_block(orig_inode, + (sector_t)(orig_blk_offset + i), bh, 0); + if (ret < 0) + goto out; + + if (bh->b_this_page != NULL) + bh = bh->b_this_page; + } + + ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, + page, fsdata); + page = NULL; + +out: + if (unlikely(page)) { + if (PageLocked(page)) + unlock_page(page); + page_cache_release(page); + } +out2: + ext4_journal_stop(handle); + + return ret < 0 ? ret : 0; +} + +/** + * mext_check_arguments - Check whether move extent can be done + * + * @orig_inode: original inode + * @donor_inode: donor inode + * @orig_start: logical start offset in block for orig + * @donor_start: logical start offset in block for donor + * @len: the number of blocks to be moved + * @moved_len: moved block length + * + * Check the arguments of ext4_move_extents() to determine whether the + * files can be exchanged with each other. + * Return 0 on success, or a negative error value on failure. + */ +static int +mext_check_arguments(struct inode *orig_inode, + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len, __u64 moved_len) +{ + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be " + "regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Ext4 move extent does not support swapfile */ + if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should " + "not be swapfile [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Files should be in the same ext4 FS */ + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files " + "should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* orig and donor should be different file */ + if (orig_inode->i_ino == donor_inode->i_ino) { + ext4_debug("ext4 move extent: The argument files should not " + "be same file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Ext4 move extent supports only extent based file */ + if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { + ext4_debug("ext4 move extent: orig file is not extents " + "based file [ino:orig %lu]\n", orig_inode->i_ino); + return -EOPNOTSUPP; + } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { + ext4_debug("ext4 move extent: donor file is not extents " + "based file [ino:donor %lu]\n", donor_inode->i_ino); + return -EOPNOTSUPP; + } + + if ((!orig_inode->i_size) || (!donor_inode->i_size)) { + ext4_debug("ext4 move extent: File size is 0 byte\n"); + return -EINVAL; + } + + /* Start offset should be same */ + if (orig_start != donor_start) { + ext4_debug("ext4 move extent: 
orig and donor's start " + "offset are not same [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (moved_len) { + ext4_debug("ext4 move extent: moved_len should be 0 " + "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, + donor_inode->i_ino); + return -EINVAL; + } + + if ((orig_start > MAX_DEFRAG_SIZE) || + (donor_start > MAX_DEFRAG_SIZE) || + (*len > MAX_DEFRAG_SIZE) || + (orig_start + *len > MAX_DEFRAG_SIZE)) { + ext4_debug("ext4 move extent: Can't handle over [%lu] blocks " + "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_inode->i_size > donor_inode->i_size) { + if (orig_start >= donor_inode->i_size) { + ext4_debug("ext4 move extent: orig start offset " + "[%llu] should be less than donor file size " + "[%lld] [ino:orig %lu, donor_inode %lu]\n", + orig_start, donor_inode->i_size, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_start + *len > donor_inode->i_size) { + ext4_debug("ext4 move extent: End offset [%llu] should " + "be less than donor file size [%lld]." + "So adjust length from %llu to %lld " + "[ino:orig %lu, donor %lu]\n", + orig_start + *len, donor_inode->i_size, + *len, donor_inode->i_size - orig_start, + orig_inode->i_ino, donor_inode->i_ino); + *len = donor_inode->i_size - orig_start; + } + } else { + if (orig_start >= orig_inode->i_size) { + ext4_debug("ext4 move extent: start offset [%llu] " + "should be less than original file size " + "[%lld] [inode:orig %lu, donor %lu]\n", + orig_start, orig_inode->i_size, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_start + *len > orig_inode->i_size) { + ext4_debug("ext4 move extent: Adjust length " + "from %llu to %lld. Because it should be " + "less than original file size " + "[ino:orig %lu, donor %lu]\n", + *len, orig_inode->i_size - orig_start, + orig_inode->i_ino, donor_inode->i_ino); + *len = orig_inode->i_size - orig_start; + } + } + + if (!*len) { + ext4_debug("ext4 move extent: len should not be 0 " + "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, + donor_inode->i_ino); + return -EINVAL; + } + + return 0; +} + +/** + * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 + * + * @inode1: the inode structure + * @inode2: the inode structure + * + * Lock two inodes' i_mutex by i_ino order. This function is moved from + * fs/inode.c. + */ +static void +mext_inode_double_lock(struct inode *inode1, struct inode *inode2) +{ + if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { + if (inode1) + mutex_lock(&inode1->i_mutex); + else if (inode2) + mutex_lock(&inode2->i_mutex); + return; + } + + if (inode1->i_ino < inode2->i_ino) { + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); + } +} + +/** + * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 + * + * @inode1: the inode that is released first + * @inode2: the inode that is released second + * + * This function is moved from fs/inode.c. 
+ */ + +static void +mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) +{ + if (inode1) + mutex_unlock(&inode1->i_mutex); + + if (inode2 && inode2 != inode1) + mutex_unlock(&inode2->i_mutex); +} + +/** + * ext4_move_extents - Exchange the specified range of a file + * + * @o_filp: file structure of the original file + * @d_filp: file structure of the donor file + * @orig_start: start offset in block for orig + * @donor_start: start offset in block for donor + * @len: the number of blocks to be moved + * @moved_len: moved block length + * + * This function returns 0 and sets the moved block length in moved_len + * on success; otherwise it returns an error value. + * + * Note: ext4_move_extents() proceeds in the following order. + * 1:Calculate the last block number of the moving range from the start + * block number (orig_start) and the number of blocks to be moved (len) + * passed as arguments. If {orig, donor}_start points into a hole, advance + * ext_cur (the current extent), holecheck_path and orig_path to the first + * extent after the hole. + * 2:Repeat steps 3 to 5 until holecheck_path points to the last extent, or + * ext_cur exceeds block_end, the last logical block number. + * 3:To get the length of the contiguous area, call mext_next_extent() + * repeatedly on ext_cur (starting from holecheck_path) until it finds a + * discontiguous extent, the start logical block number exceeds block_end, + * or the extent is the last one. + * 4:Exchange the original inode data with the donor inode data from + * orig_page_offset to seq_end_page. The start indexes of the data are + * passed as arguments; both the original and the donor inode use + * orig_page_offset (to handle the blocksize != pagesize case easily, the + * offset for the donor inode is kept in block units). + * 5:Update holecheck_path and orig_path to point at the next extent to be + * processed, then return to step 2. + * 6:Release holecheck_path and orig_path, and store the number of moved + * blocks in moved_len; a caller can use moved_len to compute the file + * offset at which the next move extent ioctl should start. + * 7:Return 0 on success, or a negative error value on failure. 
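Step 6 is what makes the ioctl restartable from userspace: a defragmenter can issue EXT4_IOC_MOVE_EXT repeatedly, advancing by moved_len each time. A minimal, hypothetical usage sketch (the struct layout and ioctl number match this patch; donor-file preparation and most error handling are elided):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    struct move_extent {
        uint32_t reserved;      /* should be zero */
        uint32_t donor_fd;      /* donor file descriptor */
        uint64_t orig_start;    /* logical start block in orig */
        uint64_t donor_start;   /* logical start block in donor */
        uint64_t len;           /* block count to move */
        uint64_t moved_len;     /* blocks actually moved (out) */
    };
    #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)

    int main(int argc, char **argv)
    {
        int orig = open(argv[1], O_RDWR);
        int donor = open(argv[2], O_WRONLY);
        struct move_extent me = {
            .donor_fd = donor, .orig_start = 0, .donor_start = 0,
            .len = 128,   /* blocks; must respect MAX_DEFRAG_SIZE */
        };

        if (argc < 3 || orig < 0 || donor < 0 ||
            ioctl(orig, EXT4_IOC_MOVE_EXT, &me) < 0) {
            perror("EXT4_IOC_MOVE_EXT");
            return 1;
        }
        printf("moved %llu blocks\n", (unsigned long long)me.moved_len);
        return 0;
    }

Note that moved_len must be zero on entry (mext_check_arguments() rejects anything else), which the designated initializer above guarantees.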
+ */ +int +ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 orig_start, __u64 donor_start, __u64 len, + __u64 *moved_len) +{ + struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct inode *donor_inode = d_filp->f_dentry->d_inode; + struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; + struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; + ext4_lblk_t block_start = orig_start; + ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; + ext4_lblk_t rest_blocks; + pgoff_t orig_page_offset = 0, seq_end_page; + int ret, depth, last_extent = 0; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + int data_offset_in_page; + int block_len_in_page; + int uninit; + + /* protect orig and donor against a truncate */ + mext_inode_double_lock(orig_inode, donor_inode); + + mext_double_down_read(orig_inode, donor_inode); + /* Check the filesystem environment whether move_extent can be done */ + ret = mext_check_arguments(orig_inode, donor_inode, orig_start, + donor_start, &len, *moved_len); + mext_double_up_read(orig_inode, donor_inode); + if (ret) + goto out2; + + file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; + block_end = block_start + len - 1; + if (file_end < block_end) + len -= block_end - file_end; + + get_ext_path(orig_path, orig_inode, block_start, ret); + if (orig_path == NULL) + goto out2; + + /* Get path structure to check the hole */ + get_ext_path(holecheck_path, orig_inode, block_start, ret); + if (holecheck_path == NULL) + goto out; + + depth = ext_depth(orig_inode); + ext_cur = holecheck_path[depth].p_ext; + if (ext_cur == NULL) { + ret = -EINVAL; + goto out; + } + + /* + * Get proper extent whose ee_block is beyond block_start + * if block_start was within the hole. + */ + if (le32_to_cpu(ext_cur->ee_block) + + ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { + last_extent = mext_next_extent(orig_inode, + holecheck_path, &ext_cur); + if (last_extent < 0) { + ret = last_extent; + goto out; + } + last_extent = mext_next_extent(orig_inode, orig_path, + &ext_dummy); + if (last_extent < 0) { + ret = last_extent; + goto out; + } + } + seq_start = block_start; + + /* No blocks within the specified range. */ + if (le32_to_cpu(ext_cur->ee_block) > block_end) { + ext4_debug("ext4 move extent: The specified range of file " + "may be the hole\n"); + ret = -EINVAL; + goto out; + } + + /* Adjust start blocks */ + add_blocks = min(le32_to_cpu(ext_cur->ee_block) + + ext4_ext_get_actual_len(ext_cur), block_end + 1) - + max(le32_to_cpu(ext_cur->ee_block), block_start); + + while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { + seq_blocks += add_blocks; + + /* Adjust tail blocks */ + if (seq_start + seq_blocks - 1 > block_end) + seq_blocks = block_end - seq_start + 1; + + ext_prev = ext_cur; + last_extent = mext_next_extent(orig_inode, holecheck_path, + &ext_cur); + if (last_extent < 0) { + ret = last_extent; + break; + } + add_blocks = ext4_ext_get_actual_len(ext_cur); + + /* + * Extend the length of contiguous block (seq_blocks) + * if extents are contiguous. + */ + if (ext4_can_extents_be_merged(orig_inode, + ext_prev, ext_cur) && + block_end >= le32_to_cpu(ext_cur->ee_block) && + !last_extent) + continue; + + /* Is original extent is uninitialized */ + uninit = ext4_ext_is_uninitialized(ext_prev); + + data_offset_in_page = seq_start % blocks_per_page; + + /* + * Calculate data blocks count that should be swapped + * at the first page. 
+ */ + if (data_offset_in_page + seq_blocks > blocks_per_page) { + /* Swapped blocks are across pages */ + block_len_in_page = + blocks_per_page - data_offset_in_page; + } else { + /* Swapped blocks are in a page */ + block_len_in_page = seq_blocks; + } + + orig_page_offset = seq_start >> + (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); + seq_end_page = (seq_start + seq_blocks - 1) >> + (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); + seq_start = le32_to_cpu(ext_cur->ee_block); + rest_blocks = seq_blocks; + + /* Discard preallocations of two inodes */ + down_write(&EXT4_I(orig_inode)->i_data_sem); + ext4_discard_preallocations(orig_inode); + up_write(&EXT4_I(orig_inode)->i_data_sem); + + down_write(&EXT4_I(donor_inode)->i_data_sem); + ext4_discard_preallocations(donor_inode); + up_write(&EXT4_I(donor_inode)->i_data_sem); + + while (orig_page_offset <= seq_end_page) { + + /* Swap original branches with new branches */ + ret = move_extent_par_page(o_filp, donor_inode, + orig_page_offset, + data_offset_in_page, + block_len_in_page, uninit); + if (ret < 0) + goto out; + orig_page_offset++; + /* Count how many blocks we have exchanged */ + *moved_len += block_len_in_page; + BUG_ON(*moved_len > len); + + data_offset_in_page = 0; + rest_blocks -= block_len_in_page; + if (rest_blocks > blocks_per_page) + block_len_in_page = blocks_per_page; + else + block_len_in_page = rest_blocks; + } + + /* Decrease buffer counter */ + if (holecheck_path) + ext4_ext_drop_refs(holecheck_path); + get_ext_path(holecheck_path, orig_inode, + seq_start, ret); + if (holecheck_path == NULL) + break; + depth = holecheck_path->p_depth; + + /* Decrease buffer counter */ + if (orig_path) + ext4_ext_drop_refs(orig_path); + get_ext_path(orig_path, orig_inode, seq_start, ret); + if (orig_path == NULL) + break; + + ext_cur = holecheck_path[depth].p_ext; + add_blocks = ext4_ext_get_actual_len(ext_cur); + seq_blocks = 0; + + } +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (holecheck_path) { + ext4_ext_drop_refs(holecheck_path); + kfree(holecheck_path); + } +out2: + mext_inode_double_unlock(orig_inode, donor_inode); + + if (ret) + return ret; + + /* All of the specified blocks must be exchanged in succeed */ + BUG_ON(*moved_len != len); + + return 0; +} From 7f4520cc6242780ce720aa440ad4b391f998b558 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 13 Jun 2009 10:09:41 -0400 Subject: [PATCH 06/15] ext4: change s_mount_opt to be an unsigned int We can only fit 32 options in s_mount_opt because an unsigned long is 32-bits on a x86 machine. So use an unsigned int to save space on 64-bit platforms. 
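Because the flag space must stay usable on ILP32 targets, bits above 31 can never be assigned anyway; on LP64 builds the unsigned long merely wasted four bytes per mounted filesystem. A trivial demonstration of the sizing point (illustrative, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        /* Typical 64-bit Linux (LP64) prints 8, then 4. */
        printf("unsigned long: %zu bytes\n", sizeof(unsigned long));
        printf("unsigned int:  %zu bytes\n", sizeof(unsigned int));
        return 0;
    }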
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 2 +- fs/ext4/super.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 276a26f117e6..569f527080bf 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -852,7 +852,7 @@ struct ext4_sb_info { struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ struct buffer_head **s_group_desc; - unsigned long s_mount_opt; + unsigned int s_mount_opt; ext4_fsblk_t s_sb_block; uid_t s_resuid; gid_t s_resgid; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2418ad36eab5..f8325a2bc897 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -93,7 +93,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %lx\n", + "data mode %x\n", bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e8f0b2af4607..4c364ae7aeb1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1655,7 +1655,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_commit_super(sb, 1); if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " - "bpg=%lu, ipg=%lu, mo=%04lx]\n", + "bpg=%lu, ipg=%lu, mo=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), From bc0b0d6d69ee9022f18ae264e62beb30ddeb322a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 13 Jun 2009 10:09:48 -0400 Subject: [PATCH 07/15] ext4: update the s_last_mounted field in the superblock This field can be very helpful when a system administrator is trying to sort through large numbers of block devices or filesystem images. What is stored in this field can be ambiguous if multiple filesystem namespaces are in play; what we store in practice is the mountpoint interpreted by the process's namespace which first opens a file in the filesystem. 
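The field is readable offline, for instance via dumpe2fs -h ("Last mounted on:"). A sketch that pulls it straight off the device follows; the offsets assume the classic ext2 superblock layout (superblock at byte 1024, s_last_mounted[64] at offset 136 within it) and should be checked against fs/ext4/ext4.h before being relied on:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        char buf[65] = { 0 };
        int fd = argc > 1 ? open(argv[1], O_RDONLY) : -1;

        if (fd < 0 || pread(fd, buf, 64, 1024 + 136) != 64) {
            perror("superblock read");
            return 1;
        }
        printf("last mounted on: %s\n", buf);   /* e.g. /home */
        return 0;
    }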
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 7 +++++++ fs/ext4/file.c | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 569f527080bf..9e268c97eeca 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -834,6 +834,12 @@ struct ext4_super_block { }; #ifdef __KERNEL__ + +/* + * Mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 + /* * fourth extended-fs super-block data in memory */ @@ -853,6 +859,7 @@ struct ext4_sb_info { struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ struct buffer_head **s_group_desc; unsigned int s_mount_opt; + unsigned int s_mount_flags; ext4_fsblk_t s_sb_block; uid_t s_resuid; gid_t s_resgid; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 588af8c77246..3f1873fef1c6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -145,6 +147,38 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static int ext4_file_open(struct inode * inode, struct file * filp) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct vfsmount *mnt = filp->f_path.mnt; + struct path path; + char buf[64], *cp; + + if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && + !(sb->s_flags & MS_RDONLY))) { + sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images. + */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt->mnt_parent; + path.dentry = mnt->mnt_mountpoint; + path_get(&path); + cp = d_path(&path, buf, sizeof(buf)); + path_put(&path); + if (!IS_ERR(cp)) { + memcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); + sb->s_dirt = 1; + } + } + return generic_file_open(inode, filp); +} + const struct file_operations ext4_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -156,7 +190,7 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, - .open = generic_file_open, + .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, From 4ab2f15b7f709c3626a7eed075a7225b4c775c7e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 13 Jun 2009 10:09:36 -0400 Subject: [PATCH 08/15] ext4: move the abort flag from s_mount_opts to s_mount_flags We're running out of space in the mount options word, and EXT4_MOUNT_ABORT isn't really a mount option, but a run-time flag. So move it to become EXT4_MF_FS_ABORTED in s_mount_flags. Also remove bogus ext2_fs.h / ext4.h simultaneous #include protection, which can never happen. 
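The separation keeps one word for what the user asked for and another for what happened at run time, so remount logic can compare saved and requested options without an abort bit getting in the way. A schematic of the split (names illustrative):

    struct demo_sb_info {
        unsigned int mount_opt;     /* parsed mount(2) options */
        unsigned int mount_flags;   /* run-time events */
    };
    #define DEMO_MF_FS_ABORTED 0x0002

    static int demo_writes_allowed(const struct demo_sb_info *sbi)
    {
        return !(sbi->mount_flags & DEMO_MF_FS_ABORTED);
    }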
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 11 ++--------- fs/ext4/inode.c | 4 ++-- fs/ext4/super.c | 10 +++++----- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9e268c97eeca..06ee5a582917 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -684,7 +684,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ -#define EXT4_MOUNT_ABORT 0x00200 /* Fatal error detected */ #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ @@ -706,17 +705,10 @@ struct ext4_inode_info { #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ -/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ -#ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt #define set_opt(o, opt) o |= EXT4_MOUNT_##opt #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) -#else -#define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD -#define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT -#define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS -#endif #define ext4_set_bit ext2_set_bit #define ext4_set_bit_atomic ext2_set_bit_atomic @@ -836,9 +828,10 @@ struct ext4_super_block { #ifdef __KERNEL__ /* - * Mount flags + * run-time mount flags */ #define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ /* * fourth extended-fs super-block data in memory diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f8325a2bc897..5f927f6a1289 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2670,13 +2670,13 @@ static int ext4_da_writepages(struct address_space *mapping, * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because + * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_da_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. 
*/ - if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) return -EROFS; /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4c364ae7aeb1..04486a53469f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -303,7 +303,7 @@ static void ext4_handle_error(struct super_block *sb) if (!test_opt(sb, ERRORS_CONT)) { journal_t *journal = EXT4_SB(sb)->s_journal; - EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; if (journal) jbd2_journal_abort(journal, -EIO); } @@ -416,7 +416,7 @@ void ext4_abort(struct super_block *sb, const char *function, ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; sb->s_flags |= MS_RDONLY; - EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } @@ -1476,7 +1476,7 @@ set_qf_format: break; #endif case Opt_abort: - set_opt(sbi->s_mount_opt, ABORT); + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; break; case Opt_nobarrier: clear_opt(sbi->s_mount_opt, BARRIER); @@ -3452,7 +3452,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -3467,7 +3467,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > ext4_blocks_count(es)) { - if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) { + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; } From 8a8a2050c844d9de224ff591e91bda3f77bd6eda Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 13 Jun 2009 10:08:59 -0400 Subject: [PATCH 09/15] ext4: document the "abort" mount option Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/ext4.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 608fdba97b72..7be02ac5fa36 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -235,6 +235,10 @@ minixdf Make 'df' act like Minix. debug Extra debugging information is sent to syslog. +abort Simulate the effects of calling ext4_abort() for + debugging purposes. This is normally used while + remounting a filesystem which is already mounted. + errors=remount-ro Remount the filesystem read-only on an error. errors=continue Keep going on a filesystem error. errors=panic Panic and halt the machine if an error occurs. From f157a4aa98a18bd3817a72bea90d48494e2586e7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 13 Jun 2009 11:09:42 -0400 Subject: [PATCH 10/15] ext4: Use a hash of the topdir directory name for the Orlov parent group Instead of using a random number to determine the goal parent group for the Orlov top directories, use a hash of the directory name. This allows for repeatable results when trying to benchmark filesystem layout algorithms. 
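The effect is that, on a given filesystem, the same directory name always maps to the same starting group. A sketch of the idea; djb2 below is only a stand-in for the seeded half-MD4 ext4fs_dirhash() the patch actually uses:

    #include <stddef.h>

    static unsigned int pick_parent_group(const char *name, size_t len,
                                          unsigned int ngroups)
    {
        unsigned long h = 5381;     /* djb2: illustrative hash only */

        while (len--)
            h = h * 33 + (unsigned char)*name++;
        return (unsigned int)h % ngroups;   /* same name, same group */
    }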
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- fs/ext4/ialloc.c | 19 ++++++++++++++----- fs/ext4/migrate.c | 5 ++--- fs/ext4/namei.c | 8 ++++---- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 06ee5a582917..d035cf149e0e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1315,7 +1315,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ -extern struct inode * ext4_new_inode(handle_t *, struct inode *, int); +extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, + const struct qstr *qstr); extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7d502f3be914..3f98ee712ff4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -470,7 +470,8 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g, */ static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode) + ext4_group_t *group, int mode, + const struct qstr *qstr) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -485,6 +486,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); + struct dx_hash_info hinfo; ngroups = real_ngroups; if (flex_size > 1) { @@ -506,7 +508,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, int best_ndir = inodes_per_group; int ret = -1; - get_random_bytes(&grp, sizeof(grp)); + if (qstr) { + hinfo.hash_version = DX_HASH_HALF_MD4; + hinfo.seed = sbi->s_hash_seed; + ext4fs_dirhash(qstr->name, qstr->len, &hinfo); + grp = hinfo.hash; + } else + get_random_bytes(&grp, sizeof(grp)); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; @@ -649,7 +657,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group + flex_size; if (*group > ngroups) *group = 0; - return find_group_orlov(sb, parent, group, mode); + return find_group_orlov(sb, parent, group, mode, 0); } /* @@ -790,7 +798,8 @@ err_ret: * For other inodes, search forward from the parent directory's block * group to find a free inode. 
*/ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) +struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, + const struct qstr *qstr) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -839,7 +848,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (test_opt(sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); else - ret2 = find_group_orlov(sb, dir, &group, mode); + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); } else ret2 = find_group_other(sb, dir, &group, mode); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index fe64d9f79852..80d075b8aeaf 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -483,9 +483,8 @@ int ext4_ext_migrate(struct inode *inode) retval = PTR_ERR(handle); return retval; } - tmp_inode = ext4_new_inode(handle, - inode->i_sb->s_root->d_inode, - S_IFREG); + tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG, 0); if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 07eb6649e4fa..5f00d2418a83 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1782,7 +1782,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode (handle, dir, mode); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; @@ -1816,7 +1816,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1853,7 +1853,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, S_IFDIR | mode); + inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2264,7 +2264,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); + inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; From 11013911daea4820147ae6d7094dd7c6894e8651 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Sat, 13 Jun 2009 11:45:35 -0400 Subject: [PATCH 11/15] ext4: teach the inode allocator to use a goal inode number Enhance the inode allocator to take a goal inode number as a parameter; if it is specified, it takes precedence over Orlov or parent directory inode allocation algorithms. The extents migration function uses the goal inode number so that the extent trees allocated by the migration function use the correct flex_bg. In the future, the goal inode functionality will also be used to allocate an adjacent inode for the extended attributes. Also, for testing purposes the goal inode number can be specified via /sys/fs/{dev}/inode_goal. This can be useful for testing inode allocation beyond 2^32 blocks on very large filesystems. 
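The goal-to-group arithmetic relies on inode numbers being 1-based and laid out group after group, EXT4_INODES_PER_GROUP at a time. A small sketch of the mapping, plus the inverse used by the migrate code to aim at the first inode of a group:

    /* Map a 1-based goal inode number to its (group, offset) slot. */
    static void goal_to_slot(unsigned int goal,
                             unsigned int inodes_per_group,
                             unsigned int *group, unsigned int *offset)
    {
        *group = (goal - 1) / inodes_per_group;
        *offset = (goal - 1) % inodes_per_group;
    }

    /* First inode number in @ino's group, as ext4_ext_migrate() computes. */
    static unsigned int group_first_ino(unsigned int ino,
                                        unsigned int inodes_per_group)
    {
        return ((ino - 1) / inodes_per_group) * inodes_per_group + 1;
    }

With 8192 inodes per group, for example, goal 8193 selects group 1, offset 0; writing a goal into /sys/fs/ext4/<dev>/inode_goal steers the next allocation the same way for testing.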
Signed-off-by: Andreas Dilger Signed-off-by: "Theodore Ts'o" --- Documentation/ABI/testing/sysfs-fs-ext4 | 10 ++++++++++ fs/ext4/ext4.h | 3 ++- fs/ext4/ialloc.c | 16 ++++++++++++---- fs/ext4/migrate.c | 5 ++++- fs/ext4/namei.c | 10 ++++++---- fs/ext4/super.c | 2 ++ 6 files changed, 36 insertions(+), 10 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4 index 4e79074de282..5fb709997d96 100644 --- a/Documentation/ABI/testing/sysfs-fs-ext4 +++ b/Documentation/ABI/testing/sysfs-fs-ext4 @@ -79,3 +79,13 @@ Description: This file is read-only and shows the number of kilobytes of data that have been written to this filesystem since it was mounted. + +What: /sys/fs/ext4//inode_goal +Date: June 2008 +Contact: "Theodore Ts'o" +Description: + Tuning parameter which (if non-zero) controls the goal + inode used by the inode allocator in preference to + all other allocation heuristics. This is intended for + debugging use only, and should be 0 on production + systems. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d035cf149e0e..746cdcba969d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -863,6 +863,7 @@ struct ext4_sb_info { int s_inode_size; int s_first_ino; unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; spinlock_t s_next_gen_lock; u32 s_next_generation; u32 s_hash_seed[4]; @@ -1316,7 +1317,7 @@ extern int ext4fs_dirhash(const char *name, int len, struct /* ialloc.c */ extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, - const struct qstr *qstr); + const struct qstr *qstr, __u32 goal); extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3f98ee712ff4..2f645732e3b7 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -799,7 +799,7 @@ err_ret: * group to find a free inode. 
*/ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, - const struct qstr *qstr) + const struct qstr *qstr, __u32 goal) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -830,6 +830,16 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, ei = EXT4_I(inode); sbi = EXT4_SB(sb); + if (!goal) + goal = sbi->s_inode_goal; + + if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) { + group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); + ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); + ret2 = 0; + goto got_group; + } + if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { @@ -858,7 +868,7 @@ got_group: if (ret2 == -1) goto out; - for (i = 0; i < ngroups; i++) { + for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); @@ -870,8 +880,6 @@ got_group: if (!inode_bitmap_bh) goto fail; - ino = 0; - repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 80d075b8aeaf..313a50b39741 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -458,6 +458,7 @@ int ext4_ext_migrate(struct inode *inode) struct inode *tmp_inode = NULL; struct list_blocks_struct lb; unsigned long max_entries; + __u32 goal; /* * If the filesystem does not support extents, or the inode @@ -483,8 +484,10 @@ int ext4_ext_migrate(struct inode *inode) retval = PTR_ERR(handle); return retval; } + goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, 0); + S_IFREG, 0, goal); if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5f00d2418a83..de04013d16ff 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1782,7 +1782,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; @@ -1816,7 +1816,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1853,7 +1853,8 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name); + inode = ext4_new_inode(handle, dir, S_IFDIR | mode, + &dentry->d_name, 0); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2264,7 +2265,8 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name); + inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, + &dentry->d_name, 0); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 04486a53469f..23013d303f81 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2206,6 +2206,7 @@ EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); 
EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); @@ -2218,6 +2219,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), ATTR_LIST(mb_max_to_scan), ATTR_LIST(mb_min_to_scan), From 0610b6e99939828b77eec020ead0e1f44cba38ca Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 15 Jun 2009 03:45:05 -0400 Subject: [PATCH 12/15] ext4: Fix 64-bit block type problem on 32-bit platforms The function ext4_mb_free_blocks() was using an "unsigned long" to pass a block number; this will cause 64-bit block numbers to get truncated on x86 and other 32-bit platforms. Signed-off-by: "Theodore Ts'o" Reviewed-by: Eric Sandeen --- fs/ext4/ext4.h | 2 +- fs/ext4/mballoc.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 746cdcba969d..17b9998680e3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1341,7 +1341,7 @@ extern void ext4_discard_preallocations(struct inode *); extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); extern void ext4_mb_free_blocks(handle_t *, struct inode *, - unsigned long, unsigned long, int, unsigned long *); + ext4_fsblk_t, unsigned long, int, unsigned long *); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern void ext4_mb_update_group_info(struct ext4_group_info *grp, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8d98070b48fb..519a0a686d94 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4689,7 +4689,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * Main entry point into mballoc to free blocks */ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count, + ext4_fsblk_t block, unsigned long count, int metadata, unsigned long *freed) { struct buffer_head *bitmap_bh = NULL; @@ -4715,11 +4715,11 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, block + count > ext4_blocks_count(es)) { ext4_error(sb, __func__, "Freeing blocks not in datazone - " - "block = %lu, count = %lu", block, count); + "block = %llu, count = %lu", block, count); goto error_return; } - ext4_debug("freeing block %lu\n", block); + ext4_debug("freeing block %llu\n", block); trace_ext4_free_blocks(inode, block, count, metadata); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); @@ -4761,7 +4761,7 @@ do_more: ext4_error(sb, __func__, "Freeing blocks in system zone - " - "Block = %lu, count = %lu", block, count); + "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_return; } From de9a55b841132f7ae097f6e31ccebad2d5030cf5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 14 Jun 2009 17:45:34 -0400 Subject: [PATCH 13/15] ext4: Fix up whitespace issues in fs/ext4/inode.c This is a pure cleanup patch. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 200 +++++++++++++++++++++++++----------------------- 1 file changed, 103 insertions(+), 97 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5f927f6a1289..8d0908afbd5b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -81,7 +81,7 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) * If the handle isn't valid we're not journaling so there's nothing to do. 
*/ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr) + struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; @@ -332,8 +332,8 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) */ static int ext4_block_to_path(struct inode *inode, - ext4_lblk_t i_block, - ext4_lblk_t offsets[4], int *boundary) + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) { int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); @@ -365,9 +365,9 @@ static int ext4_block_to_path(struct inode *inode, final = ptrs; } else { ext4_warning(inode->i_sb, "ext4_block_to_path", - "block %lu > max in inode %lu", - i_block + direct_blocks + - indirect_blocks + double_blocks, inode->i_ino); + "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); @@ -382,25 +382,25 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, while (bref < p+max) { blk = le32_to_cpu(*bref++); - if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { ext4_error(inode->i_sb, function, "invalid block reference %u " "in inode #%lu", blk, inode->i_ino); - return -EIO; - } - } - return 0; + return -EIO; + } + } + return 0; } #define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ + __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ EXT4_ADDR_PER_BLOCK((inode)->i_sb)) #define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ + __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ EXT4_NDIR_BLOCKS) /** @@ -450,7 +450,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, bh = sb_getblk(sb, le32_to_cpu(p->key)); if (unlikely(!bh)) goto failure; - + if (!bh_uptodate_or_lock(bh)) { if (bh_submit_read(bh) < 0) { put_bh(bh); @@ -462,7 +462,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, goto failure; } } - + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) @@ -555,7 +555,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * returns it. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, - Indirect *partial) + Indirect *partial) { /* * XXX need to get goal block from mballoc's data structures @@ -577,7 +577,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, * direct and indirect blocks. 
*/ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, - int blocks_to_boundary) + int blocks_to_boundary) { unsigned int count = 0; @@ -613,9 +613,9 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, * direct blocks */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) { struct ext4_allocation_request ar; int target, i; @@ -686,10 +686,10 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, } if (!*err) { if (target == blks) { - /* - * save the new block number - * for the first direct block - */ + /* + * save the new block number + * for the first direct block + */ new_blocks[index] = current_block; } blk_allocated += ar.len; @@ -731,9 +731,9 @@ failed_out: * as described above and return 0. */ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, int indirect_blks, - int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) { int blocksize = inode->i_sb->s_blocksize; int i, n = 0; @@ -780,7 +780,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, * the chain to point to the new allocated * data blocks numbers */ - for (i=1; i < num; i++) + for (i = 1; i < num; i++) *(branch[n].p + i) = cpu_to_le32(++current_block); } BUFFER_TRACE(bh, "marking uptodate"); @@ -823,7 +823,8 @@ failed: * chain to new block and return 0. */ static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, int blks) + ext4_lblk_t block, Indirect *where, int num, + int blks) { int i; int err = 0; @@ -924,9 +925,9 @@ err_out: * blocks. 
*/ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned int maxblocks, - struct buffer_head *bh_result, - int flags) + ext4_lblk_t iblock, unsigned int maxblocks, + struct buffer_head *bh_result, + int flags) { int err = -EIO; ext4_lblk_t offsets[4]; @@ -942,7 +943,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, iblock, offsets, - &blocks_to_boundary); + &blocks_to_boundary); if (depth == 0) goto out; @@ -990,8 +991,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, * Block out ext4_truncate while we alter the tree */ err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, - &count, goal, - offsets + (partial - chain), partial); + &count, goal, + offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers @@ -1002,8 +1003,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, */ if (!err) err = ext4_splice_branch(handle, inode, iblock, - partial, indirect_blks, count); - else + partial, indirect_blks, count); + else goto cleanup; set_buffer_new(bh_result); @@ -1175,7 +1176,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, block, + int ret = check_block_validity(inode, block, bh->b_blocknr, retval); if (ret != 0) return ret; @@ -1257,7 +1258,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, block, + int ret = check_block_validity(inode, block, bh->b_blocknr, retval); if (ret != 0) return ret; @@ -1408,8 +1409,7 @@ static int walk_page_buffers(handle_t *handle, for (bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); - block_start = block_end, bh = next) - { + block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { @@ -1450,7 +1450,7 @@ static int walk_page_buffers(handle_t *handle, * write. 
*/ static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) + struct buffer_head *bh) { if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; @@ -1458,15 +1458,15 @@ static int do_journal_get_write_access(handle_t *handle, } static int ext4_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct page *page; - pgoff_t index; + pgoff_t index; unsigned from, to; trace_ext4_write_begin(inode, pos, len, flags); @@ -1475,7 +1475,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, * we allocate blocks but write fails for some reason */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; - index = pos >> PAGE_CACHE_SHIFT; + index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1523,7 +1523,7 @@ retry: ext4_journal_stop(handle); if (pos + len > inode->i_size) { vmtruncate(inode, inode->i_size); - /* + /* * If vmtruncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the @@ -1550,9 +1550,9 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) } static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { int i_size_changed = 0; struct inode *inode = mapping->host; @@ -1603,9 +1603,9 @@ static int ext4_generic_write_end(struct file *file, * buffers are managed internally. */ static int ext4_ordered_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; @@ -1633,7 +1633,7 @@ static int ext4_ordered_write_end(struct file *file, if (pos + len > inode->i_size) { vmtruncate(inode, inode->i_size); - /* + /* * If vmtruncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. @@ -1647,9 +1647,9 @@ static int ext4_ordered_write_end(struct file *file, } static int ext4_writeback_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; @@ -1675,7 +1675,7 @@ static int ext4_writeback_write_end(struct file *file, if (pos + len > inode->i_size) { vmtruncate(inode, inode->i_size); - /* + /* * If vmtruncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. 
@@ -1688,9 +1688,9 @@ static int ext4_writeback_write_end(struct file *file, } static int ext4_journalled_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; @@ -1738,7 +1738,7 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { vmtruncate(inode, inode->i_size); - /* + /* * If vmtruncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. @@ -1845,7 +1845,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) } static void ext4_da_page_release_reservation(struct page *page, - unsigned long offset) + unsigned long offset) { int to_release = 0; struct buffer_head *head, *bh; @@ -2854,8 +2854,8 @@ static int ext4_nonda_switch(struct super_block *sb) } static int ext4_da_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { int ret, retries = 0; struct page *page; @@ -2925,7 +2925,7 @@ out: * when write to the end of file but not require block allocation */ static int ext4_da_should_update_i_disksize(struct page *page, - unsigned long offset) + unsigned long offset) { struct buffer_head *bh; struct inode *inode = page->mapping->host; @@ -2944,9 +2944,9 @@ static int ext4_da_should_update_i_disksize(struct page *page, } static int ext4_da_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct inode *inode = mapping->host; int ret = 0, ret2; @@ -3044,7 +3044,7 @@ int ext4_alloc_da_blocks(struct inode *inode) * not strictly speaking necessary (and for users of * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: - * + * * ext4_da_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> @@ -3064,7 +3064,7 @@ int ext4_alloc_da_blocks(struct inode *inode) * write out the pages, but rather only collect contiguous * logical block extents, call the multi-block allocator, and * then update the buffer heads with the block allocations. - * + * * For now, though, we'll cheat by calling filemap_flush(), * which will map the blocks, and start the I/O, but not * actually wait for the I/O to complete. 
@@ -3200,7 +3200,7 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) * */ static int __ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; @@ -3212,7 +3212,7 @@ static int __ext4_normal_writepage(struct page *page, } static int ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; loff_t size = i_size_read(inode); @@ -3248,7 +3248,7 @@ static int ext4_normal_writepage(struct page *page, } static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; @@ -3298,7 +3298,7 @@ out: } static int ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; loff_t size = i_size_read(inode); @@ -3401,8 +3401,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) * VFS code falls back into buffered path in that case so we are safe. */ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -3722,7 +3722,8 @@ static inline int all_zeroes(__le32 *p, __le32 *q) * (no partially truncated stuff there). */ static Indirect *ext4_find_shared(struct inode *inode, int depth, - ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) { Indirect *partial, *p; int k, err; @@ -3778,8 +3779,10 @@ no_top: * than `count' because there can be holes in there. */ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, __le32 *last) + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) { __le32 *p; if (try_to_extend_transaction(handle, inode)) { @@ -3796,10 +3799,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, } /* - * Any buffers which are on the journal will be in memory. We find - * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget() - * on them. We've already detached each block from the file, so - * bforget() in jbd2_journal_forget() should be safe. + * Any buffers which are on the journal will be in memory. We + * find them on the hash table so jbd2_journal_revoke() will + * run jbd2_journal_forget() on them. We've already detached + * each block from the file, so bforget() in + * jbd2_journal_forget() should be safe. * * AKPM: turn on bforget in jbd2_journal_forget()!!! 
*/ @@ -4171,7 +4175,7 @@ void ext4_truncate(struct inode *inode) (__le32*)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); - brelse (partial->bh); + brelse(partial->bh); partial--; } do_indirects: @@ -4412,8 +4416,9 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei) if (flags & S_DIRSYNC) ei->i_flags |= EXT4_DIRSYNC_FL; } + static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) + struct ext4_inode_info *ei) { blkcnt_t i_blocks ; struct inode *inode = &(ei->vfs_inode); @@ -4528,7 +4533,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ei->i_state |= EXT4_STATE_XATTR; + ei->i_state |= EXT4_STATE_XATTR; } } else ei->i_extra_isize = 0; @@ -4547,7 +4552,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && - ((ei->i_file_acl < + ((ei->i_file_acl < (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + EXT4_SB(sb)->s_gdb_count)) || (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { @@ -4562,15 +4567,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) !ext4_inode_is_fast_symlink(inode))) /* Validate extent which is part of inode */ ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of inode */ + /* Validate block references which are part of inode */ ret = ext4_check_inode_blockref(inode); } if (ret) { - brelse(bh); - goto bad_inode; + brelse(bh); + goto bad_inode; } if (S_ISREG(inode->i_mode)) { @@ -4601,7 +4606,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } else { brelse(bh); ret = -EIO; - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, __func__, "bogus i_mode (%o) for inode=%lu", inode->i_mode, inode->i_ino); goto bad_inode; @@ -4754,8 +4759,9 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } - } else for (block = 0; block < EXT4_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; + } else + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; raw_inode->i_disk_version = cpu_to_le32(inode->i_version); if (ei->i_extra_isize) { @@ -5109,7 +5115,7 @@ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) * Give this, we know that the caller already has write access to iloc->bh. */ int ext4_mark_iloc_dirty(handle_t *handle, - struct inode *inode, struct ext4_iloc *iloc) + struct inode *inode, struct ext4_iloc *iloc) { int err = 0; From 4159175058987cb68aefd0e9eec2598b795363b4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 15 Jun 2009 03:41:23 -0400 Subject: [PATCH 14/15] ext4: Don't update ctime for non-extent-mapped inodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VFS handles updating ctime, so we don't need to update the inode's ctime in ext4_splice_branch() to update the direct or indirect blocks. This was harmless when we did this in ext3, but in ext4, thanks to delayed allocation, updating the ctime in ext4_splice_branch() can cause the ctime to mysteriously jump when the blocks are finally allocated.
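To make the symptom concrete, here is a minimal userspace sketch (an illustration only, not part of the patch; the file name is arbitrary, and it assumes a non-extent file on an ext4 filesystem mounted with delayed allocation). With the old code, the second ctime printed below could jump to the fsync time, even though nothing touched the inode's metadata in between:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		struct stat st;
		int fd = open("ctime-test", O_CREAT | O_WRONLY | O_TRUNC, 0644);

		if (fd < 0)
			return 1;
		if (write(fd, "x", 1) != 1)	/* data parked in the page cache */
			return 1;
		fstat(fd, &st);
		printf("ctime after write: %ld\n", (long) st.st_ctime);
		sleep(2);
		fsync(fd);			/* forces the delayed allocation */
		fstat(fd, &st);
		printf("ctime after fsync: %ld\n", (long) st.st_ctime);
		close(fd);
		unlink("ctime-test");
		return 0;
	}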
Thanks to Björn Steinbrink for pointing out this problem on the git mailing list. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d0908afbd5b..7c17ae275af4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -856,10 +856,6 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, } /* We are done with atomic stuff, now do the rest of housekeeping */ - - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - /* had we spliced it onto indirect block? */ if (where->bh) { /* @@ -878,8 +874,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, } else { /* * OK, we spliced it into the inode itself on a direct block. - * Inode was dirtied above. */ + ext4_mark_inode_dirty(handle, inode); jbd_debug(5, "splicing direct\n"); } return err; From 536fc240e7147858255bdb08e7a999a3351a9fb4 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Wed, 17 Jun 2009 20:08:51 -0400 Subject: [PATCH 15/15] jbd2: clean up jbd2_journal_try_to_free_buffers() This patch reverts 3f31fddf, which is no longer needed: if a race occurs between freeing a buffer and committing a transaction and direct I/O gets an error, direct I/O now falls back to buffered I/O, thanks to commit 6ccfa806. Signed-off-by: Hisashi Hifumi Cc: Mingming Cao Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 49 ------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 996ffda06bf3..494501edba6b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1547,36 +1547,6 @@ out: return; } -/* - * jbd2_journal_try_to_free_buffers() could race with - * jbd2_journal_commit_transaction(). The later might still hold the - * reference count to the buffers when inspecting them on - * t_syncdata_list or t_locked_list. - * - * jbd2_journal_try_to_free_buffers() will call this function to - * wait for the current transaction to finish syncing data buffers, before - * try to free that buffer. - * - * Called with journal->j_state_lock hold. - */ -static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal) -{ - transaction_t *transaction; - tid_t tid; - - spin_lock(&journal->j_state_lock); - transaction = journal->j_committing_transaction; - - if (!transaction) { - spin_unlock(&journal->j_state_lock); - return; - } - - tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); - jbd2_log_wait_commit(journal, tid); -} - /** * int jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation @@ -1649,25 +1619,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, ret = try_to_free_buffers(page); - /* - * There are a number of places where jbd2_journal_try_to_free_buffers() - * could race with jbd2_journal_commit_transaction(), the later still - * holds the reference to the buffers to free while processing them. - * try_to_free_buffers() failed to free those buffers.
Some of the - * caller of releasepage() request page buffers to be dropped, otherwise - * treat the fail-to-free as errors (such as generic_file_direct_IO()) - * - * So, if the caller of try_to_release_page() wants the synchronous - * behaviour(i.e make sure buffers are dropped upon return), - * let's wait for the current transaction to finish flush of - * dirty data buffers, then try to free those buffers again, - * with the journal locked. - */ - if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { - jbd2_journal_wait_for_transaction_sync_data(journal); - ret = try_to_free_buffers(page); - } - busy: return ret; }
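For reference, a minimal standalone sketch of the bug class fixed by patch 12 above ("ext4: Fix 64-bit block type problem on 32-bit platforms"). This is an illustration rather than kernel code: fsblk_t stands in for ext4_fsblk_t, and the two free_blocks_*() helpers are hypothetical. Passing a 64-bit block number through an "unsigned long" parameter silently truncates it wherever unsigned long is 32 bits wide, as on x86-32; build with gcc -m32 to see the truncation:

	#include <stdio.h>

	typedef unsigned long long fsblk_t;	/* stand-in for ext4_fsblk_t */

	/* old prototype: block number squeezed through unsigned long */
	static void free_blocks_old(unsigned long block)
	{
		printf("old prototype frees block %lu\n", block);
	}

	/* fixed prototype: carries the full 64-bit block number */
	static void free_blocks_new(fsblk_t block)
	{
		printf("new prototype frees block %llu\n", block);
	}

	int main(void)
	{
		fsblk_t block = 0x123456789ULL;	/* needs more than 32 bits */

		free_blocks_old(block);	/* on 32-bit: 591751049 (0x23456789) */
		free_blocks_new(block);	/* everywhere: 4886718345 */
		return 0;
	}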