From c11e9fafb398411af7558fca913c2fa4a10b1f48 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 20 Jul 2007 11:24:53 -0700 Subject: [PATCH 1/9] ocfs2: Restrict inode changes in ocfs2_update_inode_atime() ocfs2_update_inode_atime() calls ocfs2_mark_inode_dirty() to push changes from the struct inode into the ocfs2 disk inode. The problem is, ocfs2_mark_inode_dirty() might change other fields, depending on what happened to the struct inode. Since we don't always have locking to serialize changes to other fields (like i_size, etc), just fix things up to only touch the atime field. Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index c4034f693e7b..992eb567cb4d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -187,6 +187,7 @@ int ocfs2_update_inode_atime(struct inode *inode, int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; mlog_entry_void(); @@ -197,11 +198,27 @@ int ocfs2_update_inode_atime(struct inode *inode, goto out; } + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * Don't use ocfs2_mark_inode_dirty() here as we don't always + * have i_mutex to guard against concurrent changes to other + * inode fields. + */ inode->i_atime = CURRENT_TIME; - ret = ocfs2_mark_inode_dirty(handle, inode, bh); + di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + + ret = ocfs2_journal_dirty(handle, bh); if (ret < 0) mlog_errno(ret); +out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: mlog_exit(ret); From a00cce356b5592208e761525a48a25902322cce9 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 20 Jul 2007 11:28:30 -0700 Subject: [PATCH 2/9] ocfs2: use s_maxbytes directly in ocfs2_change_file_space() There's no need to recalculate things via ocfs2_max_file_offset() as we've already done that to fill s_maxbytes, so use that instead. We can also un-export ocfs2_max_file_offset() then. Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 2 +- fs/ocfs2/super.c | 2 +- fs/ocfs2/super.h | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 992eb567cb4d..7e508e2942ca 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1533,7 +1533,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *di_bh = NULL; handle_t *handle; - unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits); + unsigned long long max_off = inode->i_sb->s_maxbytes; if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 200c7d4790dc..a100b488533e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -319,7 +319,7 @@ static void ocfs2_destroy_inode(struct inode *inode) /* From xfs_super.c:xfs_max_file_offset * Copyright (c) 2000-2004 Silicon Graphics, Inc. */ -unsigned long long ocfs2_max_file_offset(unsigned int blockshift) +static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) { unsigned int pagefactor = 1; unsigned int bitshift = BITS_PER_LONG - 1; diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 3b9cb3d0b008..783f5270f2a1 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -45,6 +45,4 @@ void __ocfs2_abort(struct super_block *sb, #define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) -unsigned long long ocfs2_max_file_offset(unsigned int blockshift); - #endif /* OCFS2_SUPER_H */ From 7c08d70c69150148c14f02633855f1591219c37c Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 20 Jul 2007 11:58:36 -0700 Subject: [PATCH 3/9] ocfs2: Fix some casting errors related to file writes ocfs2_align_clusters_to_page_index() needs to cast the clusters shift to pgoff_t and ocfs2_file_buffered_write() needs loff_t when calculating destination start for memcpy. Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 2 +- fs/ocfs2/ocfs2.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7e508e2942ca..b1ae4c754157 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1959,7 +1959,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, } dst = kmap_atomic(page, KM_USER0); - memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes); + memcpy(dst + (pos & (loff_t)(PAGE_CACHE_SIZE - 1)), buf, bytes); kunmap_atomic(dst, KM_USER0); flush_dcache_page(page); ocfs2_put_write_source(user_page); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 5cc90a40b3c5..58307853fb4a 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -494,16 +494,16 @@ static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, /* * Find the 1st page index which covers the given clusters. */ -static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb, +static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb, u32 clusters) { unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; - unsigned long index = clusters; + pgoff_t index = clusters; if (PAGE_CACHE_SHIFT > cbits) { - index = clusters >> (PAGE_CACHE_SHIFT - cbits); + index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits); } else if (PAGE_CACHE_SHIFT < cbits) { - index = clusters << (cbits - PAGE_CACHE_SHIFT); + index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT); } return index; From ce76fd30ce98cdaeb38dca0dfbb3fa6d2801c5ce Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 20 Jul 2007 12:02:14 -0700 Subject: [PATCH 4/9] ocfs2: check ia_size limits in setattr We have to manually check the requested truncate size as the check in vmtruncate() comes too late for Ocfs2. Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index b1ae4c754157..4ffa715be09c 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1028,6 +1028,11 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } if (size_change && attr->ia_size != i_size_read(inode)) { + if (attr->ia_size > sb->s_maxbytes) { + status = -EFBIG; + goto bail_unlock; + } + if (i_size_read(inode) > attr->ia_size) status = ocfs2_truncate_file(inode, bh, attr->ia_size); else From 5a25403175b8a945e93fc9c64ae9cf54f5730add Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 20 Jul 2007 12:56:16 -0700 Subject: [PATCH 5/9] ocfs2: Fix max offset calculations ocfs2_max_file_offset() was over-estimating the largest file size for several cases. This wasn't really a problem before, but now that we support sparse files, it needs to be more accurate. Signed-off-by: Mark Fasheh --- fs/ocfs2/super.c | 66 ++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a100b488533e..c3c89e26567c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -316,39 +316,51 @@ static void ocfs2_destroy_inode(struct inode *inode) kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); } -/* From xfs_super.c:xfs_max_file_offset - * Copyright (c) 2000-2004 Silicon Graphics, Inc. - */ -static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) +static unsigned long long ocfs2_max_file_offset(unsigned int bbits, + unsigned int cbits) { - unsigned int pagefactor = 1; - unsigned int bitshift = BITS_PER_LONG - 1; + unsigned int bytes = 1 << cbits; + unsigned int trim = bytes; + unsigned int bitshift = 32; - /* Figure out maximum filesize, on Linux this can depend on - * the filesystem blocksize (on 32 bit platforms). - * __block_prepare_write does this in an [unsigned] long... - * page->index << (PAGE_CACHE_SHIFT - bbits) - * So, for page sized blocks (4K on 32 bit platforms), - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is - * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) - * but for smaller blocksizes it is less (bbits = log2 bsize). - * Note1: get_block_t takes a long (implicit cast from above) - * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch - * can optionally convert the [unsigned] long from above into - * an [unsigned] long long. + /* + * i_size and all block offsets in ocfs2 are always 64 bits + * wide. i_clusters is 32 bits, in cluster-sized units. So on + * 64 bit platforms, cluster size will be the limiting factor. */ #if BITS_PER_LONG == 32 # if defined(CONFIG_LBD) BUILD_BUG_ON(sizeof(sector_t) != 8); - pagefactor = PAGE_CACHE_SIZE; - bitshift = BITS_PER_LONG; + /* + * We might be limited by page cache size. + */ + if (bytes > PAGE_CACHE_SIZE) { + bytes = PAGE_CACHE_SIZE; + trim = 1; + /* + * Shift by 31 here so that we don't get larger than + * MAX_LFS_FILESIZE + */ + bitshift = 31; + } # else - pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); + /* + * We are limited by the size of sector_t. Use block size, as + * that's what we expose to the VFS. + */ + bytes = 1 << bbits; + trim = 1; + bitshift = 31; # endif #endif - return (((unsigned long long)pagefactor) << bitshift) - 1; + /* + * Trim by a whole cluster when we can actually approach the + * on-disk limits. Otherwise we can overflow i_clusters when + * an extent start is at the max offset. + */ + return (((unsigned long long)bytes) << bitshift) - trim; } static int ocfs2_remount(struct super_block *sb, int *flags, char *data) @@ -1259,8 +1271,8 @@ static int ocfs2_initialize_super(struct super_block *sb, int sector_size) { int status = 0; - int i; - struct ocfs2_dinode *di = NULL; + int i, cbits, bbits; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; struct buffer_head *bitmap_bh = NULL; struct ocfs2_journal *journal; @@ -1281,7 +1293,9 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_export_op = &ocfs2_export_ops; sb->s_flags |= MS_NOATIME; /* this is needed to support O_LARGEFILE */ - sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits); + cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); + bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); + sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); osb->sb = sb; /* Save off for ocfs2_rw_direct */ @@ -1341,8 +1355,6 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - di = (struct ocfs2_dinode *)bh->b_data; - osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { mlog(ML_ERROR, "Invalid number of node slots (%u)\n", From 6a18380e7ddd7d1a0493efe3be6475dd92323364 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 23 Jul 2007 10:01:21 +0200 Subject: [PATCH 6/9] [2.6 patch] ocfs2_insert_extent(): remove dead code This patch removes some now dead code. Spotted by the Coverity checker. Signed-off-by: Adrian Bunk Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f5e11f4fa952..4f517665c9a0 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3731,7 +3731,6 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, { int status; struct buffer_head *last_eb_bh = NULL; - struct buffer_head *bh = NULL; struct ocfs2_insert_type insert = {0, }; struct ocfs2_extent_rec rec; @@ -3783,9 +3782,6 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, ocfs2_extent_map_insert_rec(inode, &rec); bail: - if (bh) - brelse(bh); - if (last_eb_bh) brelse(last_eb_bh); From 480214d71f1972756473415d31953647952400fb Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 6 Aug 2007 15:11:56 -0700 Subject: [PATCH 7/9] ocfs2: Fix rename/extend race If one process is extending a file while another is renaming it, there exists a window when rename could flush the old inode's stale i_size to disk. This patch recognizes the fact that rename is only updating the old inode's ctime, so it ensures only that value is flushed to disk. Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/namei.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index d430fdab16e9..701e6d04ed5d 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1080,6 +1080,7 @@ static int ocfs2_rename(struct inode *old_dir, struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, // this is the 1st dirent bh nlink_t old_dir_nlink = old_dir->i_nlink; + struct ocfs2_dinode *old_di; /* At some point it might be nice to break this function up a * bit. */ @@ -1354,7 +1355,20 @@ static int ocfs2_rename(struct inode *old_dir, old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); - ocfs2_mark_inode_dirty(handle, old_inode, old_inode_bh); + + status = ocfs2_journal_access(handle, old_inode, old_inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status >= 0) { + old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; + + old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); + old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); + + status = ocfs2_journal_dirty(handle, old_inode_bh); + if (status < 0) + mlog_errno(status); + } else + mlog_errno(status); /* now that the name has been added to new_dir, remove the old name */ status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); From ce17204ae633001ef41318d487282730e96b9522 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Mon, 30 Jul 2007 11:02:50 -0700 Subject: [PATCH 8/9] ocfs2: Retry sendpage() if it returns EAGAIN Instead of treating EAGAIN, returned from sendpage(), as an error, this patch retries the operation. Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/tcp.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index f0bdfd944c44..685c18065c82 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -854,17 +854,25 @@ static void o2net_sendpage(struct o2net_sock_container *sc, struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); ssize_t ret; - - mutex_lock(&sc->sc_send_lock); - ret = sc->sc_sock->ops->sendpage(sc->sc_sock, - virt_to_page(kmalloced_virt), - (long)kmalloced_virt & ~PAGE_MASK, - size, MSG_DONTWAIT); - mutex_unlock(&sc->sc_send_lock); - if (ret != size) { + while (1) { + mutex_lock(&sc->sc_send_lock); + ret = sc->sc_sock->ops->sendpage(sc->sc_sock, + virt_to_page(kmalloced_virt), + (long)kmalloced_virt & ~PAGE_MASK, + size, MSG_DONTWAIT); + mutex_unlock(&sc->sc_send_lock); + if (ret == size) + break; + if (ret == (ssize_t)-EAGAIN) { + mlog(0, "sendpage of size %zu to " SC_NODEF_FMT + " returned EAGAIN\n", size, SC_NODEF_ARGS(sc)); + cond_resched(); + continue; + } mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); o2net_ensure_shutdown(nn, sc, 0); + break; } } From e0dceaf0a4b8c55076a4dbcba7ac8b05755f5cc6 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 9 Aug 2007 16:52:30 -0700 Subject: [PATCH 9/9] ocfs2: set non-default s_time_gran during mount We need to manually set this to '1' during mount, otherwise inode_setattr() will chop off the nanosecond portion of our timestamps. Signed-off-by: Mark Fasheh --- fs/ocfs2/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c3c89e26567c..f2fc9a795deb 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1291,6 +1291,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; sb->s_export_op = &ocfs2_export_ops; + sb->s_time_gran = 1; sb->s_flags |= MS_NOATIME; /* this is needed to support O_LARGEFILE */ cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);