diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 0bfafe108357..5a5a05582b58 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -228,30 +228,19 @@ default behaviour. Deprecated Mount Options ======================== - delaylog/nodelaylog - Delayed logging is the only logging method that XFS supports - now, so these mount options are now ignored. +None at present. - Due for removal in 3.12. - ihashsize=value - In memory inode hashes have been removed, so this option has - no function as of August 2007. Option is deprecated. +Removed Mount Options +===================== - Due for removal in 3.12. + Name Removed + ---- ------- + delaylog/nodelaylog v3.20 + ihashsize v3.20 + irixsgid v3.20 + osyncisdsync/osyncisosync v3.20 - irixsgid - This behaviour is now controlled by a sysctl, so the mount - option is ignored. - - Due for removal in 3.12. - - osyncisdsync - osyncisosync - O_SYNC and O_DSYNC are fully supported, so there is no need - for these options any more. - - Due for removal in 3.12. sysctls ======= diff --git a/fs/open.c b/fs/open.c index 6796f04d6032..98e5a52dc68c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -231,8 +231,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EINVAL; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + if (mode & ~FALLOC_FL_SUPPORTED_MASK) return -EOPNOTSUPP; /* Punch hole and zero range are mutually exclusive */ @@ -250,6 +249,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) (mode & ~FALLOC_FL_COLLAPSE_RANGE)) return -EINVAL; + /* Insert range should only be used exclusively. */ + if ((mode & FALLOC_FL_INSERT_RANGE) && + (mode & ~FALLOC_FL_INSERT_RANGE)) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) return -EBADF; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index a6fbf4472017..516162be1398 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -260,6 +260,7 @@ xfs_alloc_fix_len( rlen = rlen - (k - args->mod); else rlen = rlen - args->prod + (args->mod - k); + /* casts to (int) catch length underflows */ if ((int)rlen < (int)args->minlen) return; ASSERT(rlen >= args->minlen && rlen <= args->maxlen); @@ -286,7 +287,8 @@ xfs_alloc_fix_minleft( if (diff >= 0) return 1; args->len += diff; /* shrink the allocated space */ - if (args->len >= args->minlen) + /* casts to (int) catch length underflows */ + if ((int)args->len >= (int)args->minlen) return 1; args->agbno = NULLAGBLOCK; return 0; @@ -315,6 +317,9 @@ xfs_alloc_fixup_trees( xfs_agblock_t nfbno2; /* second new free startblock */ xfs_extlen_t nflen1=0; /* first new free length */ xfs_extlen_t nflen2=0; /* second new free length */ + struct xfs_mount *mp; + + mp = cnt_cur->bc_mp; /* * Look up the record in the by-size tree if necessary. @@ -323,13 +328,13 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, i == 1 && nfbno1 == fbno && nflen1 == flen); #endif } else { if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } /* * Look up the record in the by-block tree if necessary. @@ -338,13 +343,13 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, i == 1 && nfbno1 == fbno && nflen1 == flen); #endif } else { if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } #ifdef DEBUG @@ -355,7 +360,7 @@ xfs_alloc_fixup_trees( bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, bnoblock->bb_numrecs == cntblock->bb_numrecs); } #endif @@ -386,25 +391,25 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* * Add new by-size btree entry(s). */ if (nfbno1 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } if (nfbno2 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } /* * Fix up the by-block btree entry(s). @@ -415,7 +420,7 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } else { /* * Update the by-block entry to start later|be shorter. @@ -429,10 +434,10 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } return 0; } @@ -682,7 +687,7 @@ xfs_alloc_ag_vextent_exact( error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(fbno <= args->agbno); /* @@ -783,7 +788,7 @@ xfs_alloc_find_best_extent( error = xfs_alloc_get_rec(*scur, sbno, slen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); /* @@ -946,7 +951,7 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); if (ltlen >= args->minlen) break; if ((error = xfs_btree_increment(cnt_cur, 0, &i))) @@ -966,7 +971,7 @@ xfs_alloc_ag_vextent_near( */ if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, <bnoa, <lena); if (ltlena < args->minlen) @@ -999,7 +1004,7 @@ xfs_alloc_ag_vextent_near( cnt_cur->bc_ptrs[0] = besti; if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; if (!xfs_alloc_fix_minleft(args)) { @@ -1088,7 +1093,7 @@ xfs_alloc_ag_vextent_near( if (bno_cur_lt) { if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, <bnoa, <lena); if (ltlena >= args->minlen) @@ -1104,7 +1109,7 @@ xfs_alloc_ag_vextent_near( if (bno_cur_gt) { if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, gtbno, gtlen, >bnoa, >lena); if (gtlena >= args->minlen) @@ -1303,7 +1308,7 @@ xfs_alloc_ag_vextent_size( error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); @@ -1342,7 +1347,7 @@ xfs_alloc_ag_vextent_size( * This can't happen in the second case above. */ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), error0); if (rlen < args->maxlen) { xfs_agblock_t bestfbno; @@ -1362,13 +1367,13 @@ xfs_alloc_ag_vextent_size( if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); if (flen < bestrlen) break; xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), error0); if (rlen > bestrlen) { @@ -1383,7 +1388,7 @@ xfs_alloc_ag_vextent_size( if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); rlen = bestrlen; rbno = bestrbno; flen = bestflen; @@ -1408,7 +1413,7 @@ xfs_alloc_ag_vextent_size( if (!xfs_alloc_fix_minleft(args)) goto out_nominleft; rlen = args->len; - XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); /* * Allocate and initialize a cursor for the by-block tree. */ @@ -1422,7 +1427,7 @@ xfs_alloc_ag_vextent_size( cnt_cur = bno_cur = NULL; args->len = rlen; args->agbno = rbno; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(args->mp, args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); @@ -1467,7 +1472,7 @@ xfs_alloc_ag_vextent_small( if (i) { if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); } /* * Nothing in the btree, try the freelist. Make sure @@ -1493,7 +1498,7 @@ xfs_alloc_ag_vextent_small( } args->len = 1; args->agbno = fbno; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(args->mp, args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); @@ -1579,7 +1584,7 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * It's not contiguous, though. */ @@ -1591,7 +1596,8 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0); + XFS_WANT_CORRUPTED_GOTO(mp, + ltbno + ltlen <= bno, error0); } } /* @@ -1606,7 +1612,7 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * It's not contiguous, though. */ @@ -1618,7 +1624,7 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0); + XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0); } } /* @@ -1635,31 +1641,31 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Delete the old by-size entry on the right. */ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Delete the old by-block entry for the right block. */ if ((error = xfs_btree_delete(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Move the by-block cursor back to the left neighbor. */ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); #ifdef DEBUG /* * Check that this is the right record: delete didn't @@ -1672,7 +1678,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(mp, i == 1 && xxbno == ltbno && xxlen == ltlen, error0); } @@ -1695,17 +1701,17 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Back up the by-block cursor to the left neighbor, and * update its length. */ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); nbno = ltbno; nlen = len + ltlen; if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) @@ -1721,10 +1727,10 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Update the starting block and length of the right * neighbor in the by-block tree. @@ -1743,7 +1749,7 @@ xfs_free_ag_extent( nlen = len; if ((error = xfs_btree_insert(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); } xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); bno_cur = NULL; @@ -1752,10 +1758,10 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 0, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0); if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); cnt_cur = NULL; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 15105dbc9e28..04e79d57bca6 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -86,8 +86,83 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args, int move_count); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); +/* + * attr3 block 'firstused' conversion helpers. + * + * firstused refers to the offset of the first used byte of the nameval region + * of an attr leaf block. The region starts at the tail of the block and expands + * backwards towards the middle. As such, firstused is initialized to the block + * size for an empty leaf block and is reduced from there. + * + * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k. + * The in-core firstused field is 32-bit and thus supports the maximum fsb size. + * The on-disk field is only 16-bit, however, and overflows at 64k. Since this + * only occurs at exactly 64k, we use zero as a magic on-disk value to represent + * the attr block size. The following helpers manage the conversion between the + * in-core and on-disk formats. + */ + +static void +xfs_attr3_leaf_firstused_from_disk( + struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from) +{ + struct xfs_attr3_leaf_hdr *hdr3; + + if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { + hdr3 = (struct xfs_attr3_leaf_hdr *) from; + to->firstused = be16_to_cpu(hdr3->firstused); + } else { + to->firstused = be16_to_cpu(from->hdr.firstused); + } + + /* + * Convert from the magic fsb size value to actual blocksize. This + * should only occur for empty blocks when the block size overflows + * 16-bits. + */ + if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) { + ASSERT(!to->count && !to->usedbytes); + ASSERT(geo->blksize > USHRT_MAX); + to->firstused = geo->blksize; + } +} + +static void +xfs_attr3_leaf_firstused_to_disk( + struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from) +{ + struct xfs_attr3_leaf_hdr *hdr3; + uint32_t firstused; + + /* magic value should only be seen on disk */ + ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF); + + /* + * Scale down the 32-bit in-core firstused value to the 16-bit on-disk + * value. This only overflows at the max supported value of 64k. Use the + * magic on-disk value to represent block size in this case. + */ + firstused = from->firstused; + if (firstused > USHRT_MAX) { + ASSERT(from->firstused == geo->blksize); + firstused = XFS_ATTR3_LEAF_NULLOFF; + } + + if (from->magic == XFS_ATTR3_LEAF_MAGIC) { + hdr3 = (struct xfs_attr3_leaf_hdr *) to; + hdr3->firstused = cpu_to_be16(firstused); + } else { + to->hdr.firstused = cpu_to_be16(firstused); + } +} + void xfs_attr3_leaf_hdr_from_disk( + struct xfs_da_geometry *geo, struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from) { @@ -104,7 +179,7 @@ xfs_attr3_leaf_hdr_from_disk( to->magic = be16_to_cpu(hdr3->info.hdr.magic); to->count = be16_to_cpu(hdr3->count); to->usedbytes = be16_to_cpu(hdr3->usedbytes); - to->firstused = be16_to_cpu(hdr3->firstused); + xfs_attr3_leaf_firstused_from_disk(geo, to, from); to->holes = hdr3->holes; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { @@ -118,7 +193,7 @@ xfs_attr3_leaf_hdr_from_disk( to->magic = be16_to_cpu(from->hdr.info.magic); to->count = be16_to_cpu(from->hdr.count); to->usedbytes = be16_to_cpu(from->hdr.usedbytes); - to->firstused = be16_to_cpu(from->hdr.firstused); + xfs_attr3_leaf_firstused_from_disk(geo, to, from); to->holes = from->hdr.holes; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { @@ -129,10 +204,11 @@ xfs_attr3_leaf_hdr_from_disk( void xfs_attr3_leaf_hdr_to_disk( + struct xfs_da_geometry *geo, struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from) { - int i; + int i; ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || from->magic == XFS_ATTR3_LEAF_MAGIC); @@ -145,7 +221,7 @@ xfs_attr3_leaf_hdr_to_disk( hdr3->info.hdr.magic = cpu_to_be16(from->magic); hdr3->count = cpu_to_be16(from->count); hdr3->usedbytes = cpu_to_be16(from->usedbytes); - hdr3->firstused = cpu_to_be16(from->firstused); + xfs_attr3_leaf_firstused_to_disk(geo, to, from); hdr3->holes = from->holes; hdr3->pad1 = 0; @@ -160,7 +236,7 @@ xfs_attr3_leaf_hdr_to_disk( to->hdr.info.magic = cpu_to_be16(from->magic); to->hdr.count = cpu_to_be16(from->count); to->hdr.usedbytes = cpu_to_be16(from->usedbytes); - to->hdr.firstused = cpu_to_be16(from->firstused); + xfs_attr3_leaf_firstused_to_disk(geo, to, from); to->hdr.holes = from->holes; to->hdr.pad1 = 0; @@ -178,7 +254,7 @@ xfs_attr3_leaf_verify( struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr3_icleaf_hdr ichdr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; @@ -757,9 +833,10 @@ xfs_attr_shortform_allfit( struct xfs_attr3_icleaf_hdr leafhdr; int bytes; int i; + struct xfs_mount *mp = bp->b_target->bt_mount; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); bytes = sizeof(struct xfs_attr_sf_hdr); @@ -812,7 +889,7 @@ xfs_attr3_leaf_to_shortform( memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); leaf = (xfs_attr_leafblock_t *)tmpbuffer; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); /* XXX (dgc): buffer is about to be marked stale - why zero it? */ @@ -923,7 +1000,7 @@ xfs_attr3_leaf_to_node( btree = dp->d_ops->node_tree_p(node); leaf = bp2->b_addr; - xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); /* both on-disk, don't endian-flip twice */ @@ -988,7 +1065,7 @@ xfs_attr3_leaf_create( } ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); *bpp = bp; @@ -1073,7 +1150,7 @@ xfs_attr3_leaf_add( trace_xfs_attr_leaf_add(args); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index >= 0 && args->index <= ichdr.count); entsize = xfs_attr_leaf_newentsize(args, NULL); @@ -1126,7 +1203,7 @@ xfs_attr3_leaf_add( tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); out_log_hdr: - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); @@ -1294,7 +1371,7 @@ xfs_attr3_leaf_compact( ichdr_dst->freemap[0].base; /* write the header back to initialise the underlying buffer */ - xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, @@ -1344,9 +1421,10 @@ xfs_attr_leaf_order( { struct xfs_attr3_icleaf_hdr ichdr1; struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_mount *mp = leaf1_bp->b_target->bt_mount; - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr); return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); } @@ -1388,8 +1466,8 @@ xfs_attr3_leaf_rebalance( ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2); ASSERT(ichdr2.count == 0); args = state->args; @@ -1490,8 +1568,8 @@ xfs_attr3_leaf_rebalance( ichdr1.count, count); } - xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); - xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2); xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); @@ -1684,7 +1762,7 @@ xfs_attr3_leaf_toosmall( */ blk = &state->path.blk[ state->path.active-1 ]; leaf = blk->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf); bytes = xfs_attr3_leaf_hdr_size(leaf) + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + ichdr.usedbytes; @@ -1740,7 +1818,7 @@ xfs_attr3_leaf_toosmall( if (error) return error; - xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr); bytes = state->args->geo->blksize - (state->args->geo->blksize >> 2) - @@ -1805,7 +1883,7 @@ xfs_attr3_leaf_remove( trace_xfs_attr_leaf_remove(args); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); ASSERT(args->index >= 0 && args->index < ichdr.count); @@ -1918,12 +1996,11 @@ xfs_attr3_leaf_remove( tmp = be16_to_cpu(entry->nameidx); } ichdr.firstused = tmp; - if (!ichdr.firstused) - ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN; + ASSERT(ichdr.firstused != 0); } else { ichdr.holes = 1; /* mark as needing compaction */ } - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); @@ -1957,8 +2034,8 @@ xfs_attr3_leaf_unbalance( drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf); - xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf); entry = xfs_attr3_leaf_entryp(drop_leaf); /* @@ -2012,7 +2089,7 @@ xfs_attr3_leaf_unbalance( tmphdr.firstused = state->args->geo->blksize; /* write the header to the temp buffer to initialise it */ - xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr); if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { @@ -2039,7 +2116,7 @@ xfs_attr3_leaf_unbalance( kmem_free(tmp_leaf); } - xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr); xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, state->args->geo->blksize - 1); @@ -2085,7 +2162,7 @@ xfs_attr3_leaf_lookup_int( trace_xfs_attr_leaf_lookup(args); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); ASSERT(ichdr.count < args->geo->blksize / 8); @@ -2190,7 +2267,7 @@ xfs_attr3_leaf_getvalue( int valuelen; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(ichdr.count < args->geo->blksize / 8); ASSERT(args->index < ichdr.count); @@ -2391,8 +2468,9 @@ xfs_attr_leaf_lasthash( { struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; + struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr); entries = xfs_attr3_leaf_entryp(bp->b_addr); if (count) *count = ichdr.count; @@ -2486,7 +2564,7 @@ xfs_attr3_leaf_clearflag( ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); @@ -2550,7 +2628,7 @@ xfs_attr3_leaf_setflag( leaf = bp->b_addr; #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); #endif @@ -2629,11 +2707,11 @@ xfs_attr3_leaf_flipflags( entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1); ASSERT(args->index < ichdr1.count); ASSERT(args->index >= 0); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2); ASSERT(args->index2 < ichdr2.count); ASSERT(args->index2 >= 0); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index e2929da7c3ba..025c4b820c03 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -100,9 +100,11 @@ int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, +void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from); -void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to, +void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 61ec015dca16..aeffeaaac0ec 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -244,30 +244,6 @@ xfs_bmap_forkoff_reset( } } -/* - * Debug/sanity checking code - */ - -STATIC int -xfs_bmap_sanity_check( - struct xfs_mount *mp, - struct xfs_buf *bp, - int level) -{ - struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - - if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) && - block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC)) - return 0; - - if (be16_to_cpu(block->bb_level) != level || - be16_to_cpu(block->bb_numrecs) == 0 || - be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) - return 0; - - return 1; -} - #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( @@ -410,9 +386,6 @@ xfs_bmap_check_leaf_extents( goto error_norelse; } block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); if (level == 0) break; @@ -424,7 +397,8 @@ xfs_bmap_check_leaf_extents( xfs_check_block(block, mp, 0, 0); pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + XFS_WANT_CORRUPTED_GOTO(mp, + XFS_FSB_SANITY_CHECK(mp, bno), error0); if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); @@ -1029,7 +1003,7 @@ xfs_bmap_add_attrfork_btree( if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) goto error0; /* must be at least one entry */ - XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0); if ((error = xfs_btree_new_iroot(cur, flags, &stat))) goto error0; if (stat == 0) { @@ -1311,14 +1285,12 @@ xfs_bmap_read_extents( if (error) return error; block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); if (level == 0) break; pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + XFS_WANT_CORRUPTED_GOTO(mp, + XFS_FSB_SANITY_CHECK(mp, bno), error0); xfs_trans_brelse(tp, bp); } /* @@ -1345,9 +1317,6 @@ xfs_bmap_read_extents( XFS_ERRLEVEL_LOW, ip->i_mount, block); goto error0; } - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, 0), - error0); /* * Read-ahead the next leaf block, if any. */ @@ -1755,7 +1724,9 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + struct xfs_mount *mp; + mp = bma->tp ? bma->tp->t_mountp : NULL; ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); ASSERT(bma->idx >= 0); @@ -1866,15 +1837,15 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_delete(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_decrement(bma->cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -1907,7 +1878,7 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -1938,7 +1909,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + @@ -1968,12 +1939,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2001,7 +1972,7 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -2038,12 +2009,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2084,7 +2055,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -2122,12 +2093,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2191,12 +2162,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2212,9 +2183,8 @@ xfs_bmap_add_extent_delay_real( diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); if (diff > 0) { - error = xfs_icsb_modify_counters(bma->ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0); + error = xfs_mod_fdblocks(bma->ip->i_mount, + -((int64_t)diff), false); ASSERT(!error); if (error) goto done; @@ -2265,9 +2235,8 @@ xfs_bmap_add_extent_delay_real( temp += bma->cur->bc_private.b.allocated; ASSERT(temp <= da_old); if (temp < da_old) - xfs_icsb_modify_counters(bma->ip->i_mount, - XFS_SBS_FDBLOCKS, - (int64_t)(da_old - temp), 0); + xfs_mod_fdblocks(bma->ip->i_mount, + (int64_t)(da_old - temp), false); } /* clear out the allocated field, done with it now in any case. */ @@ -2309,6 +2278,7 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ + struct xfs_mount *mp = tp->t_mountp; *logflagsp = 0; @@ -2421,19 +2391,19 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + @@ -2464,13 +2434,13 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount, @@ -2499,13 +2469,13 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, @@ -2532,7 +2502,7 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount, newext))) @@ -2569,7 +2539,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -2611,7 +2581,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -2621,7 +2591,7 @@ xfs_bmap_add_extent_unwritten_real( cur->bc_rec.b = *new; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2651,7 +2621,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -2689,7 +2659,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -2699,11 +2669,11 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2737,7 +2707,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); /* new right extent - oldext */ if ((error = xfs_bmbt_update(cur, r[1].br_startoff, r[1].br_startblock, r[1].br_blockcount, @@ -2749,7 +2719,7 @@ xfs_bmap_add_extent_unwritten_real( new->br_startoff - PREV.br_startoff; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); /* * Reset the cursor to the position of the new extent * we are about to insert as we can't trust it after @@ -2759,12 +2729,12 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); /* new middle extent - newext */ cur->bc_rec.b.br_state = new->br_state; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2944,8 +2914,8 @@ xfs_bmap_add_extent_hole_delay( } if (oldlen != newlen) { ASSERT(oldlen > newlen); - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(oldlen - newlen), 0); + xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen), + false); /* * Nothing to do for disk quota accounting here. */ @@ -2968,7 +2938,9 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ + struct xfs_mount *mp; + mp = bma->tp ? bma->tp->t_mountp : NULL; ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); @@ -3056,15 +3028,15 @@ xfs_bmap_add_extent_hole_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_delete(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_decrement(bma->cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -3097,7 +3069,7 @@ xfs_bmap_add_extent_hole_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -3131,7 +3103,7 @@ xfs_bmap_add_extent_hole_real( right.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -3161,12 +3133,12 @@ xfs_bmap_add_extent_hole_real( new->br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = new->br_state; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; } @@ -4160,18 +4132,15 @@ xfs_bmapi_reserve_delalloc( ASSERT(indlen > 0); if (rt) { - error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - -((int64_t)extsz), 0); + error = xfs_mod_frextents(mp, -((int64_t)extsz)); } else { - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)alen), 0); + error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); } if (error) goto out_unreserve_quota; - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)indlen), 0); + error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false); if (error) goto out_unreserve_blocks; @@ -4198,9 +4167,9 @@ xfs_bmapi_reserve_delalloc( out_unreserve_blocks: if (rt) - xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); + xfs_mod_frextents(mp, extsz); else - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); + xfs_mod_fdblocks(mp, alen, false); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? @@ -4801,7 +4770,7 @@ xfs_bmap_del_extent( got.br_startblock, got.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } da_old = da_new = 0; } else { @@ -4835,7 +4804,7 @@ xfs_bmap_del_extent( } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); break; case 2: @@ -4935,7 +4904,8 @@ xfs_bmap_del_extent( got.br_startblock, temp, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, + i == 1, done); /* * Update the btree record back * to the original value. @@ -4956,7 +4926,7 @@ xfs_bmap_del_extent( error = -ENOSPC; goto done; } - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } else flags |= xfs_ilog_fext(whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, @@ -5012,10 +4982,8 @@ xfs_bmap_del_extent( * Nothing to do for disk quota accounting here. */ ASSERT(da_old >= da_new); - if (da_old > da_new) { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - da_new), 0); - } + if (da_old > da_new) + xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); done: *logflagsp = flags; return error; @@ -5284,14 +5252,13 @@ xfs_bunmapi( rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); - xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int64_t)rtexts, 0); + xfs_mod_frextents(mp, (int64_t)rtexts); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)del.br_blockcount, 0); + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, + false); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); @@ -5453,6 +5420,7 @@ xfs_bmse_merge( struct xfs_bmbt_irec left; xfs_filblks_t blockcount; int error, i; + struct xfs_mount *mp = ip->i_mount; xfs_bmbt_get_all(gotp, &got); xfs_bmbt_get_all(leftp, &left); @@ -5487,19 +5455,19 @@ xfs_bmse_merge( got.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); error = xfs_btree_delete(cur, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, left.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); left.br_blockcount = blockcount; @@ -5518,50 +5486,92 @@ xfs_bmse_shift_one( int *current_ext, struct xfs_bmbt_rec_host *gotp, struct xfs_btree_cur *cur, - int *logflags) + int *logflags, + enum shift_direction direction) { struct xfs_ifork *ifp; + struct xfs_mount *mp; xfs_fileoff_t startoff; - struct xfs_bmbt_rec_host *leftp; + struct xfs_bmbt_rec_host *adj_irecp; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec left; + struct xfs_bmbt_irec adj_irec; int error; int i; + int total_extents; + mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(gotp, &got); - startoff = got.br_startoff - offset_shift_fsb; /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); - /* - * Check for merge if we've got an extent to the left, otherwise make - * sure there's enough room at the start of the file for the shift. - */ - if (*current_ext) { - /* grab the left extent and check for a large enough hole */ - leftp = xfs_iext_get_ext(ifp, *current_ext - 1); - xfs_bmbt_get_all(leftp, &left); + if (direction == SHIFT_LEFT) { + startoff = got.br_startoff - offset_shift_fsb; - if (startoff < left.br_startoff + left.br_blockcount) + /* + * Check for merge if we've got an extent to the left, + * otherwise make sure there's enough room at the start + * of the file for the shift. + */ + if (!*current_ext) { + if (got.br_startoff < offset_shift_fsb) + return -EINVAL; + goto update_current_ext; + } + /* + * grab the left extent and check for a large + * enough hole. + */ + adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1); + xfs_bmbt_get_all(adj_irecp, &adj_irec); + + if (startoff < + adj_irec.br_startoff + adj_irec.br_blockcount) return -EINVAL; /* check whether to merge the extent or shift it down */ - if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) { + if (xfs_bmse_can_merge(&adj_irec, &got, + offset_shift_fsb)) { return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, gotp, leftp, cur, - logflags); + *current_ext, gotp, adj_irecp, + cur, logflags); } - } else if (got.br_startoff < offset_shift_fsb) - return -EINVAL; - + } else { + startoff = got.br_startoff + offset_shift_fsb; + /* nothing to move if this is the last extent */ + if (*current_ext >= (total_extents - 1)) + goto update_current_ext; + /* + * If this is not the last extent in the file, make sure there + * is enough room between current extent and next extent for + * accommodating the shift. + */ + adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1); + xfs_bmbt_get_all(adj_irecp, &adj_irec); + if (startoff + got.br_blockcount > adj_irec.br_startoff) + return -EINVAL; + /* + * Unlike a left shift (which involves a hole punch), + * a right shift does not modify extent neighbors + * in any way. We should never find mergeable extents + * in this scenario. Check anyways and warn if we + * encounter two extents that could be one. + */ + if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb)) + WARN_ON_ONCE(1); + } /* * Increment the extent index for the next iteration, update the start * offset of the in-core extent and update the btree if applicable. */ - (*current_ext)++; +update_current_ext: + if (direction == SHIFT_LEFT) + (*current_ext)++; + else + (*current_ext)--; xfs_bmbt_set_startoff(gotp, startoff); *logflags |= XFS_ILOG_CORE; if (!cur) { @@ -5573,18 +5583,18 @@ xfs_bmse_shift_one( got.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); got.br_startoff = startoff; return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, got.br_state); + got.br_blockcount, got.br_state); } /* - * Shift extent records to the left to cover a hole. + * Shift extent records to the left/right to cover/create a hole. * * The maximum number of extents to be shifted in a single operation is - * @num_exts. @start_fsb specifies the file offset to start the shift and the + * @num_exts. @stop_fsb specifies the file offset at which to stop shift and the * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb * is the length by which each extent is shifted. If there is no hole to shift * the extents into, this will be considered invalid operation and we abort @@ -5594,12 +5604,13 @@ int xfs_bmap_shift_extents( struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t start_fsb, + xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, int *done, - xfs_fileoff_t *next_fsb, + xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, + enum shift_direction direction, int num_exts) { struct xfs_btree_cur *cur = NULL; @@ -5609,10 +5620,11 @@ xfs_bmap_shift_extents( struct xfs_ifork *ifp; xfs_extnum_t nexts = 0; xfs_extnum_t current_ext; + xfs_extnum_t total_extents; + xfs_extnum_t stop_extent; int error = 0; int whichfork = XFS_DATA_FORK; int logflags = 0; - int total_extents; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5628,6 +5640,8 @@ xfs_bmap_shift_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT); ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -5644,44 +5658,84 @@ xfs_bmap_shift_extents( cur->bc_private.b.flags = 0; } - /* - * Look up the extent index for the fsb where we start shifting. We can - * henceforth iterate with current_ext as extent list changes are locked - * out via ilock. - * - * gotp can be null in 2 cases: 1) if there are no extents or 2) - * start_fsb lies in a hole beyond which there are no extents. Either - * way, we are done. - */ - gotp = xfs_iext_bno_to_ext(ifp, start_fsb, ¤t_ext); - if (!gotp) { - *done = 1; - goto del_cursor; - } - /* * There may be delalloc extents in the data fork before the range we * are collapsing out, so we cannot use the count of real extents here. * Instead we have to calculate it from the incore fork. */ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - while (nexts++ < num_exts && current_ext < total_extents) { + if (total_extents == 0) { + *done = 1; + goto del_cursor; + } + + /* + * In case of first right shift, we need to initialize next_fsb + */ + if (*next_fsb == NULLFSBLOCK) { + gotp = xfs_iext_get_ext(ifp, total_extents - 1); + xfs_bmbt_get_all(gotp, &got); + *next_fsb = got.br_startoff; + if (stop_fsb > *next_fsb) { + *done = 1; + goto del_cursor; + } + } + + /* Lookup the extent index at which we have to stop */ + if (direction == SHIFT_RIGHT) { + gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); + /* Make stop_extent exclusive of shift range */ + stop_extent--; + } else + stop_extent = total_extents; + + /* + * Look up the extent index for the fsb where we start shifting. We can + * henceforth iterate with current_ext as extent list changes are locked + * out via ilock. + * + * gotp can be null in 2 cases: 1) if there are no extents or 2) + * *next_fsb lies in a hole beyond which there are no extents. Either + * way, we are done. + */ + gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, ¤t_ext); + if (!gotp) { + *done = 1; + goto del_cursor; + } + + /* some sanity checking before we finally start shifting extents */ + if ((direction == SHIFT_LEFT && current_ext >= stop_extent) || + (direction == SHIFT_RIGHT && current_ext <= stop_extent)) { + error = -EIO; + goto del_cursor; + } + + while (nexts++ < num_exts) { error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, - ¤t_ext, gotp, cur, &logflags); + ¤t_ext, gotp, cur, &logflags, + direction); if (error) goto del_cursor; + /* + * If there was an extent merge during the shift, the extent + * count can change. Update the total and grade the next record. + */ + if (direction == SHIFT_LEFT) { + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + stop_extent = total_extents; + } - /* update total extent count and grab the next record */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - if (current_ext >= total_extents) + if (current_ext == stop_extent) { + *done = 1; + *next_fsb = NULLFSBLOCK; break; + } gotp = xfs_iext_get_ext(ifp, current_ext); } - /* Check if we are done */ - if (current_ext == total_extents) { - *done = 1; - } else if (next_fsb) { + if (!*done) { xfs_bmbt_get_all(gotp, &got); *next_fsb = got.br_startoff; } @@ -5696,3 +5750,189 @@ xfs_bmap_shift_extents( return error; } + +/* + * Splits an extent into two extents at split_fsb block such that it is + * the first block of the current_ext. @current_ext is a target extent + * to be split. @split_fsb is a block where the extents is split. + * If split_fsb lies in a hole or the first block of extents, just return 0. + */ +STATIC int +xfs_bmap_split_extent_at( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t split_fsb, + xfs_fsblock_t *firstfsb, + struct xfs_bmap_free *free_list) +{ + int whichfork = XFS_DATA_FORK; + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec new; /* split extent */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_fsblock_t gotblkcnt; /* new block count for got */ + xfs_extnum_t current_ext; + int error = 0; + int logflags = 0; + int i = 0; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_split_extent_at", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + /* + * gotp can be null in 2 cases: 1) if there are no extents + * or 2) split_fsb lies in a hole beyond which there are + * no extents. Either way, we are done. + */ + gotp = xfs_iext_bno_to_ext(ifp, split_fsb, ¤t_ext); + if (!gotp) + return 0; + + xfs_bmbt_get_all(gotp, &got); + + /* + * Check split_fsb lies in a hole or the start boundary offset + * of the extent. + */ + if (got.br_startoff >= split_fsb) + return 0; + + gotblkcnt = split_fsb - got.br_startoff; + new.br_startoff = split_fsb; + new.br_startblock = got.br_startblock + gotblkcnt; + new.br_blockcount = got.br_blockcount - gotblkcnt; + new.br_state = got.br_state; + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstfsb; + cur->bc_private.b.flist = free_list; + cur->bc_private.b.flags = 0; + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + xfs_bmbt_set_blockcount(gotp, gotblkcnt); + got.br_blockcount = gotblkcnt; + + logflags = XFS_ILOG_CORE; + if (cur) { + error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state); + if (error) + goto del_cursor; + } else + logflags |= XFS_ILOG_DEXT; + + /* Add new extent */ + current_ext++; + xfs_iext_insert(ip, current_ext, 1, &new, 0); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, new.br_startoff, + new.br_startblock, new.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor); + cur->bc_rec.b.br_state = new.br_state; + + error = xfs_btree_insert(cur, &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + /* + * Convert to a btree if necessary. + */ + if (xfs_bmap_needs_btree(ip, whichfork)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + } + +del_cursor: + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + return error; +} + +int +xfs_bmap_split_extent( + struct xfs_inode *ip, + xfs_fileoff_t split_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t firstfsb; + int committed; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&free_list, &firstfsb); + + error = xfs_bmap_split_extent_at(tp, ip, split_fsb, + &firstfsb, &free_list); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + +out: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b9d8a499d2c4..6aaa0c1c7200 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -166,6 +166,11 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) */ #define XFS_BMAP_MAX_SHIFT_EXTENTS 1 +enum shift_direction { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + #ifdef DEBUG void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int whichfork, unsigned long caller_ip); @@ -211,8 +216,10 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb, - int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, int num_exts); + xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, + int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, enum shift_direction direction, + int num_exts); +int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 81cad433df85..c72283dd8d44 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -168,7 +168,7 @@ xfs_btree_check_lptr( xfs_fsblock_t bno, /* btree block disk address */ int level) /* btree block level */ { - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, level > 0 && bno != NULLFSBLOCK && XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); @@ -187,7 +187,7 @@ xfs_btree_check_sptr( { xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, level > 0 && bno != NULLAGBLOCK && bno != 0 && @@ -1825,7 +1825,7 @@ xfs_btree_lookup( error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; @@ -2285,7 +2285,7 @@ xfs_btree_rshift( if (error) goto error0; i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_increment(tcur, level, &i); if (error) @@ -3138,7 +3138,7 @@ xfs_btree_insert( goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); level++; /* @@ -3582,15 +3582,15 @@ xfs_btree_delrec( * Actually any entry but the first would suffice. */ i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_increment(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); /* Grab a pointer to the block. */ right = xfs_btree_get_block(tcur, level, &rbp); @@ -3634,12 +3634,12 @@ xfs_btree_delrec( rrecs = xfs_btree_get_numrecs(right); if (!xfs_btree_ptr_is_null(cur, &lptr)) { i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); } } @@ -3653,13 +3653,13 @@ xfs_btree_delrec( * previous block. */ i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); /* Grab a pointer to the block. */ left = xfs_btree_get_block(tcur, level, &lbp); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 9cb0115c6bd1..2385f8cd08ab 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -538,12 +538,12 @@ xfs_da3_root_split( oldroot = blk1->bp->b_addr; if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { - struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da3_icnode_hdr icnodehdr; - dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot); + dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot); btree = dp->d_ops->node_tree_p(oldroot); - size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); - level = nodehdr.level; + size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); + level = icnodehdr.level; /* * we are about to copy oldroot to bp, so set up the type diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 0a49b0286372..74bcbabfa523 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -725,7 +725,13 @@ struct xfs_attr3_icleaf_hdr { __uint16_t magic; __uint16_t count; __uint16_t usedbytes; - __uint16_t firstused; + /* + * firstused is 32-bit here instead of 16-bit like the on-disk variant + * to support maximum fsb size of 64k without overflow issues throughout + * the attr code. Instead, the overflow condition is handled on + * conversion to/from disk. + */ + __uint32_t firstused; __u8 holes; struct { __uint16_t base; @@ -733,6 +739,12 @@ struct xfs_attr3_icleaf_hdr { } freemap[XFS_ATTR_LEAF_MAPSIZE]; }; +/* + * Special value to represent fs block size in the leaf header firstused field. + * Only used when block size overflows the 2-bytes available on disk. + */ +#define XFS_ATTR3_LEAF_NULLOFF 0 + /* * Flags used in the leaf_entry[i].flags field. * NOTE: the INCOMPLETE bit must not collide with the flags bits specified diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 5ff31be9b1cd..de1ea16f5748 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -89,7 +89,7 @@ __xfs_dir3_data_check( * so just ensure that the count falls somewhere inside the * block right now. */ - XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) < + XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) < ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); break; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): @@ -107,21 +107,21 @@ __xfs_dir3_data_check( bf = ops->data_bestfree_p(hdr); count = lastfree = freeseen = 0; if (!bf[0].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset); freeseen |= 1 << 0; } if (!bf[1].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset); freeseen |= 1 << 1; } if (!bf[2].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset); freeseen |= 1 << 2; } - XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); - XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); /* * Loop over the data/unused entries. @@ -134,18 +134,18 @@ __xfs_dir3_data_check( * doesn't need to be there. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - XFS_WANT_CORRUPTED_RETURN(lastfree == 0); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0); + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == (char *)dup - (char *)hdr); dfp = xfs_dir2_data_freefind(hdr, bf, dup); if (dfp) { i = (int)(dfp - bf); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, (freeseen & (1 << i)) == 0); freeseen |= 1 << i; } else { - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(dup->length) <= be16_to_cpu(bf[2].length)); } @@ -160,13 +160,13 @@ __xfs_dir3_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0); + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*ops->data_entry_tag_p(dep)) == (char *)dep - (char *)hdr); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); count++; lastfree = 0; @@ -183,14 +183,15 @@ __xfs_dir3_data_check( be32_to_cpu(lep[i].hashval) == hash) break; } - XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); + XFS_WANT_CORRUPTED_RETURN(mp, + i < be32_to_cpu(btp->count)); } p += ops->data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. */ - XFS_WANT_CORRUPTED_RETURN(freeseen == 7); + XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { @@ -198,13 +199,13 @@ __xfs_dir3_data_check( cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; if (i > 0) - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); } - XFS_WANT_CORRUPTED_RETURN(count == + XFS_WANT_CORRUPTED_RETURN(mp, count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale)); } return 0; } diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 8eb718979383..4daaa662337b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -264,68 +264,6 @@ typedef struct xfs_dsb { /* must be padded to 64 bit alignment */ } xfs_dsb_t; -/* - * Sequence number values for the fields. - */ -typedef enum { - XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS, - XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO, - XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS, - XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS, - XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE, - XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG, - XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG, - XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT, - XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO, - XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, - XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, - XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, - XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, - XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, - XFS_SBS_PQUOTINO, XFS_SBS_LSN, - XFS_SBS_FIELDCOUNT -} xfs_sb_field_t; - -/* - * Mask values, defined based on the xfs_sb_field_t values. - * Only define the ones we're using. - */ -#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x) -#define XFS_SB_UUID XFS_SB_MVAL(UUID) -#define XFS_SB_FNAME XFS_SB_MVAL(FNAME) -#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO) -#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO) -#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO) -#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM) -#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO) -#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO) -#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS) -#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) -#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) -#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) -#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) -#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) -#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) -#define XFS_SB_FEATURES2 (XFS_SB_MVAL(FEATURES2) | \ - XFS_SB_MVAL(BAD_FEATURES2)) -#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) -#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) -#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) -#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) -#define XFS_SB_CRC XFS_SB_MVAL(CRC) -#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) -#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) -#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) -#define XFS_SB_MOD_BITS \ - (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ - XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ - XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ - XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ - XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \ - XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \ - XFS_SB_PQUOTINO) - /* * Misc. Flags - warning - these will be cleared by xfs_repair unless diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 116ef1ddb3e3..07349a183a11 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -376,7 +376,8 @@ xfs_ialloc_ag_alloc( */ newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && - args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) + percpu_counter_read(&args.mp->m_icount) + newlen > + args.mp->m_maxicount) return -ENOSPC; args.minlen = args.maxlen = args.mp->m_ialloc_blks; /* @@ -700,7 +701,7 @@ xfs_ialloc_next_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); } return 0; @@ -724,7 +725,7 @@ xfs_ialloc_get_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); } return 0; @@ -783,12 +784,12 @@ xfs_dialloc_ag_inobt( error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(j == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0); if (rec.ir_freecount > 0) { /* @@ -944,19 +945,19 @@ xfs_dialloc_ag_inobt( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); for (;;) { error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if (rec.ir_freecount > 0) break; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); } alloc_inode: @@ -1016,7 +1017,7 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1); /* * See if we've landed in the parent inode record. The finobt @@ -1039,10 +1040,10 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(rcur, &rrec, &j); if (error) goto error_rcur; - XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); + XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur); } - XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); + XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur); if (i == 1 && j == 1) { /* * Both the left and right records are valid. Choose the closer @@ -1095,7 +1096,7 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); return 0; } } @@ -1106,12 +1107,12 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); return 0; } @@ -1133,19 +1134,19 @@ xfs_dialloc_ag_update_inobt( error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; - XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) && (rec.ir_freecount == frec->ir_freecount)); return xfs_inobt_update(cur, &rec); @@ -1340,7 +1341,8 @@ xfs_dialloc( * inode. */ if (mp->m_maxicount && - mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { + percpu_counter_read(&mp->m_icount) + mp->m_ialloc_inos > + mp->m_maxicount) { noroom = 1; okalloc = 0; } @@ -1475,14 +1477,14 @@ xfs_difree_inobt( __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); error = xfs_inobt_get_rec(cur, &rec, &i); if (error) { xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Get the offset in the inode chunk. */ @@ -1592,7 +1594,7 @@ xfs_difree_finobt( * freed an inode in a previously fully allocated chunk. If not, * something is out of sync. */ - XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, ibtrec->ir_free, &i); @@ -1613,12 +1615,12 @@ xfs_difree_finobt( error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(i == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); rec.ir_free |= XFS_INOBT_MASK(offset); rec.ir_freecount++; - XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && + XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) && (rec.ir_freecount == ibtrec->ir_freecount), error); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index b0a5fe95a3e2..dc4bfc5d88fc 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -111,14 +111,6 @@ xfs_mount_validate_sb( bool check_inprogress, bool check_version) { - - /* - * If the log device and data device have the - * same device number, the log is internal. - * Consequently, the sb_logstart should be non-zero. If - * we have a zero sb_logstart in this case, we may be trying to mount - * a volume filesystem in a non-volume manner. - */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; @@ -743,17 +735,15 @@ xfs_initialize_perag_data( btree += pag->pagf_btreeblks; xfs_perag_put(pag); } - /* - * Overwrite incore superblock counters with just-read data - */ + + /* Overwrite incore superblock counters with just-read data */ spin_lock(&mp->m_sb_lock); sbp->sb_ifree = ifree; sbp->sb_icount = ialloc; sbp->sb_fdblocks = bfree + bfreelst + btree; spin_unlock(&mp->m_sb_lock); - /* Fixup the per-cpu counters as well. */ - xfs_icsb_reinit_counters(mp); + xfs_reinit_percpu_counters(mp); return 0; } @@ -771,6 +761,10 @@ xfs_log_sb( struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0); + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 1d8eef9cf0f5..a56960dd1684 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1232,6 +1232,117 @@ xfs_vm_releasepage( return try_to_free_buffers(page); } +/* + * When we map a DIO buffer, we may need to attach an ioend that describes the + * type of write IO we are doing. This passes to the completion function the + * operations it needs to perform. If the mapping is for an overwrite wholly + * within the EOF then we don't need an ioend and so we don't allocate one. + * This avoids the unnecessary overhead of allocating and freeing ioends for + * workloads that don't require transactions on IO completion. + * + * If we get multiple mappings in a single IO, we might be mapping different + * types. But because the direct IO can only have a single private pointer, we + * need to ensure that: + * + * a) i) the ioend spans the entire region of unwritten mappings; or + * ii) the ioend spans all the mappings that cross or are beyond EOF; and + * b) if it contains unwritten extents, it is *permanently* marked as such + * + * We could do this by chaining ioends like buffered IO does, but we only + * actually get one IO completion callback from the direct IO, and that spans + * the entire IO regardless of how many mappings and IOs are needed to complete + * the DIO. There is only going to be one reference to the ioend and its life + * cycle is constrained by the DIO completion code. hence we don't need + * reference counting here. + */ +static void +xfs_map_direct( + struct inode *inode, + struct buffer_head *bh_result, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + struct xfs_ioend *ioend; + xfs_off_t size = bh_result->b_size; + int type; + + if (ISUNWRITTEN(imap)) + type = XFS_IO_UNWRITTEN; + else + type = XFS_IO_OVERWRITE; + + trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); + + if (bh_result->b_private) { + ioend = bh_result->b_private; + ASSERT(ioend->io_size > 0); + ASSERT(offset >= ioend->io_offset); + if (offset + size > ioend->io_offset + ioend->io_size) + ioend->io_size = offset - ioend->io_offset + size; + + if (type == XFS_IO_UNWRITTEN && type != ioend->io_type) + ioend->io_type = XFS_IO_UNWRITTEN; + + trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, + ioend->io_size, ioend->io_type, + imap); + } else if (type == XFS_IO_UNWRITTEN || + offset + size > i_size_read(inode)) { + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_size = size; + + bh_result->b_private = ioend; + set_buffer_defer_completion(bh_result); + + trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, + imap); + } else { + trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, + imap); + } +} + +/* + * If this is O_DIRECT or the mpage code calling tell them how large the mapping + * is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the mapping + * for blocks beyond EOF must be marked new so that sub block regions can be + * correctly zeroed. We can't do this for mappings within EOF unless the mapping + * was just allocated or is unwritten, otherwise the callers would overwrite + * existing data with zeros. Hence we have to split the mapping into a range up + * to and including EOF, and a second mapping for beyond EOF. + */ +static void +xfs_map_trim_size( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + struct xfs_bmbt_irec *imap, + xfs_off_t offset, + ssize_t size) +{ + xfs_off_t mapping_size; + + mapping_size = imap->br_startoff + imap->br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; +} + STATIC int __xfs_get_blocks( struct inode *inode, @@ -1320,31 +1431,37 @@ __xfs_get_blocks( xfs_iunlock(ip, lockmode); } - - trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); + trace_xfs_get_blocks_alloc(ip, offset, size, + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN + : XFS_IO_DELALLOC, &imap); } else if (nimaps) { - trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + trace_xfs_get_blocks_found(ip, offset, size, + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN + : XFS_IO_OVERWRITE, &imap); xfs_iunlock(ip, lockmode); } else { trace_xfs_get_blocks_notfound(ip, offset, size); goto out_unlock; } + /* trim mapping down to size requested */ + if (direct || size > (1 << inode->i_blkbits)) + xfs_map_trim_size(inode, iblock, bh_result, + &imap, offset, size); + + /* + * For unwritten extents do not report a disk address in the buffered + * read case (treat as if we're reading into a hole). + */ if (imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK) { - /* - * For unwritten extents do not report a disk address on - * the read case (treat as if we're reading into a hole). - */ - if (create || !ISUNWRITTEN(&imap)) - xfs_map_buffer(inode, bh_result, &imap, offset); - if (create && ISUNWRITTEN(&imap)) { - if (direct) { - bh_result->b_private = inode; - set_buffer_defer_completion(bh_result); - } + imap.br_startblock != DELAYSTARTBLOCK && + (create || !ISUNWRITTEN(&imap))) { + xfs_map_buffer(inode, bh_result, &imap, offset); + if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); - } + /* direct IO needs special help */ + if (create && direct) + xfs_map_direct(inode, bh_result, &imap, offset); } /* @@ -1377,39 +1494,6 @@ __xfs_get_blocks( } } - /* - * If this is O_DIRECT or the mpage code calling tell them how large - * the mapping is, so that we can avoid repeated get_blocks calls. - * - * If the mapping spans EOF, then we have to break the mapping up as the - * mapping for blocks beyond EOF must be marked new so that sub block - * regions can be correctly zeroed. We can't do this for mappings within - * EOF unless the mapping was just allocated or is unwritten, otherwise - * the callers would overwrite existing data with zeros. Hence we have - * to split the mapping into a range up to and including EOF, and a - * second mapping for beyond EOF. - */ - if (direct || size > (1 << inode->i_blkbits)) { - xfs_off_t mapping_size; - - mapping_size = imap.br_startoff + imap.br_blockcount - iblock; - mapping_size <<= inode->i_blkbits; - - ASSERT(mapping_size > 0); - if (mapping_size > size) - mapping_size = size; - if (offset < i_size_read(inode) && - offset + mapping_size >= i_size_read(inode)) { - /* limit mapping to block that spans EOF */ - mapping_size = roundup_64(i_size_read(inode) - offset, - 1 << inode->i_blkbits); - } - if (mapping_size > LONG_MAX) - mapping_size = LONG_MAX; - - bh_result->b_size = mapping_size; - } - return 0; out_unlock: @@ -1440,9 +1524,11 @@ xfs_get_blocks_direct( /* * Complete a direct I/O write request. * - * If the private argument is non-NULL __xfs_get_blocks signals us that we - * need to issue a transaction to convert the range from unwritten to written - * extents. + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite + * wholly within the EOF and so there is nothing for us to do. Note that in this + * case the completion can be called in interrupt context, whereas if we have an + * ioend we will always be called in task context (i.e. from a workqueue). */ STATIC void xfs_end_io_direct_write( @@ -1454,43 +1540,71 @@ xfs_end_io_direct_write( struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + struct xfs_ioend *ioend = private; + + trace_xfs_gbmap_direct_endio(ip, offset, size, + ioend ? ioend->io_type : 0, NULL); + + if (!ioend) { + ASSERT(offset + size <= i_size_read(inode)); + return; + } if (XFS_FORCED_SHUTDOWN(mp)) - return; + goto out_end_io; /* - * While the generic direct I/O code updates the inode size, it does - * so only after the end_io handler is called, which means our - * end_io handler thinks the on-disk size is outside the in-core - * size. To prevent this just update it a little bit earlier here. + * dio completion end_io functions are only called on writes if more + * than 0 bytes was written. */ + ASSERT(size > 0); + + /* + * The ioend only maps whole blocks, while the IO may be sector aligned. + * Hence the ioend offset/size may not match the IO offset/size exactly. + * Because we don't map overwrites within EOF into the ioend, the offset + * may not match, but only if the endio spans EOF. Either way, write + * the IO sizes into the ioend so that completion processing does the + * right thing. + */ + ASSERT(offset + size <= ioend->io_offset + ioend->io_size); + ioend->io_size = size; + ioend->io_offset = offset; + + /* + * The ioend tells us whether we are doing unwritten extent conversion + * or an append transaction that updates the on-disk file size. These + * cases are the only cases where we should *potentially* be needing + * to update the VFS inode size. + * + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + */ + spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) i_size_write(inode, offset + size); + spin_unlock(&ip->i_flags_lock); /* - * For direct I/O we do not know if we need to allocate blocks or not, - * so we can't preallocate an append transaction, as that results in - * nested reservations and log space deadlocks. Hence allocate the - * transaction here. While this is sub-optimal and can block IO - * completion for some time, we're stuck with doing it this way until - * we can pass the ioend to the direct IO allocation callbacks and - * avoid nesting that way. + * If we are doing an append IO that needs to update the EOF on disk, + * do the transaction reserve now so we can use common end io + * processing. Stashing the error (if there is one) in the ioend will + * result in the ioend processing passing on the error if it is + * possible as we can't return it from here. */ - if (private && size > 0) { - xfs_iomap_write_unwritten(ip, offset, size); - } else if (offset + size > ip->i_d.di_size) { - struct xfs_trans *tp; - int error; + if (ioend->io_type == XFS_IO_OVERWRITE) + ioend->io_error = xfs_setfilesize_trans_alloc(ioend); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return; - } - - xfs_setfilesize(ip, tp, offset, size); - } +out_end_io: + xfs_end_io(&ioend->io_work); + return; } STATIC ssize_t diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 83af4c149635..f9c1c64782d3 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -132,9 +132,10 @@ xfs_attr3_leaf_inactive( int size; int tmp; int i; + struct xfs_mount *mp = bp->b_target->bt_mount; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); /* * Count the number of "remote" value extents. diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index a43d370d2c58..65fb37a18e92 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -225,6 +225,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) int error, i; struct xfs_buf *bp; struct xfs_inode *dp = context->dp; + struct xfs_mount *mp = dp->i_mount; trace_xfs_attr_node_list(context); @@ -256,7 +257,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, + &leafhdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); if (cursor->hashval > be32_to_cpu( entries[leafhdr.count - 1].hashval)) { @@ -340,7 +342,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) xfs_trans_brelse(NULL, bp); return error; } - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; cursor->blkno = leafhdr.forw; @@ -368,11 +370,12 @@ xfs_attr3_leaf_list_int( struct xfs_attr_leaf_entry *entry; int retval; int i; + struct xfs_mount *mp = context->dp->i_mount; trace_xfs_attr_list_leaf(context); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); cursor = context->cursor; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 22a5dcb70b32..a52bbd3abc7d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1376,22 +1376,19 @@ xfs_zero_file_space( } /* - * xfs_collapse_file_space() - * This routine frees disk space and shift extent for the given file. - * The first thing we do is to free data blocks in the specified range - * by calling xfs_free_file_space(). It would also sync dirty data - * and invalidate page cache over the region on which collapse range - * is working. And Shift extent records to the left to cover a hole. - * RETURNS: - * 0 on success - * errno on error - * + * @next_fsb will keep track of the extent currently undergoing shift. + * @stop_fsb will keep track of the extent at which we have to stop. + * If we are shifting left, we will start with block (offset + len) and + * shift each extent till last extent. + * If we are shifting right, we will start with last extent inside file space + * and continue until we reach the block corresponding to offset. */ -int -xfs_collapse_file_space( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t len) +static int +xfs_shift_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + enum shift_direction direction) { int done = 0; struct xfs_mount *mp = ip->i_mount; @@ -1400,21 +1397,26 @@ xfs_collapse_file_space( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; int committed; - xfs_fileoff_t start_fsb; + xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); - trace_xfs_collapse_file_space(ip); + if (direction == SHIFT_LEFT) { + next_fsb = XFS_B_TO_FSB(mp, offset + len); + stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); + } else { + /* + * If right shift, delegate the work of initialization of + * next_fsb to xfs_bmap_shift_extent as it has ilock held. + */ + next_fsb = NULLFSBLOCK; + stop_fsb = XFS_B_TO_FSB(mp, offset); + } - next_fsb = XFS_B_TO_FSB(mp, offset + len); shift_fsb = XFS_B_TO_FSB(mp, len); - error = xfs_free_file_space(ip, offset, len); - if (error) - return error; - /* * Trim eofblocks to avoid shifting uninitialized post-eof preallocation * into the accessible region of the file. @@ -1427,20 +1429,28 @@ xfs_collapse_file_space( /* * Writeback and invalidate cache for the remainder of the file as we're - * about to shift down every extent from the collapse range to EOF. The - * free of the collapse range above might have already done some of - * this, but we shouldn't rely on it to do anything outside of the range - * that was freed. + * about to shift down every extent from offset to EOF. */ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - offset + len, -1); + offset, -1); if (error) return error; error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - (offset + len) >> PAGE_CACHE_SHIFT, -1); + offset >> PAGE_CACHE_SHIFT, -1); if (error) return error; + /* + * The extent shiting code works on extent granularity. So, if + * stop_fsb is not the starting block of extent, we need to split + * the extent at stop_fsb. + */ + if (direction == SHIFT_RIGHT) { + error = xfs_bmap_split_extent(ip, stop_fsb); + if (error) + return error; + } + while (!error && !done) { tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); /* @@ -1464,7 +1474,7 @@ xfs_collapse_file_space( if (error) goto out; - xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_bmap_init(&free_list, &first_block); @@ -1472,10 +1482,9 @@ xfs_collapse_file_space( * We are using the write transaction in which max 2 bmbt * updates are allowed */ - start_fsb = next_fsb; - error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb, - &done, &next_fsb, &first_block, &free_list, - XFS_BMAP_MAX_SHIFT_EXTENTS); + error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb, + &done, stop_fsb, &first_block, &free_list, + direction, XFS_BMAP_MAX_SHIFT_EXTENTS); if (error) goto out; @@ -1484,17 +1493,69 @@ xfs_collapse_file_space( goto out; error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(ip, XFS_ILOCK_EXCL); } return error; out: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } +/* + * xfs_collapse_file_space() + * This routine frees disk space and shift extent for the given file. + * The first thing we do is to free data blocks in the specified range + * by calling xfs_free_file_space(). It would also sync dirty data + * and invalidate page cache over the region on which collapse range + * is working. And Shift extent records to the left to cover a hole. + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_collapse_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + trace_xfs_collapse_file_space(ip); + + error = xfs_free_file_space(ip, offset, len); + if (error) + return error; + + return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT); +} + +/* + * xfs_insert_file_space() + * This routine create hole space by shifting extents for the given file. + * The first thing we do is to sync dirty data and invalidate page cache + * over the region on which insert range is working. And split an extent + * to two extents at given offset by calling xfs_bmap_split_extent. + * And shift all extent records which are laying between [offset, + * last allocated extent] to the right to reserve hole range. + * RETURNS: + * 0 on success + * errno on error + */ +int +xfs_insert_file_space( + struct xfs_inode *ip, + loff_t offset, + loff_t len) +{ + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + trace_xfs_insert_file_space(ip); + + return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT); +} + /* * We need to check that the format of the data fork in the temporary inode is * valid for the target inode before doing the swap. This is not a problem with @@ -1599,13 +1660,6 @@ xfs_swap_extent_flush( /* Verify O_DIRECT for ftmp */ if (VFS_I(ip)->i_mapping->nrpages) return -EINVAL; - - /* - * Don't try to swap extents on mmap()d files because we can't lock - * out races against page faults safely. - */ - if (mapping_mapped(VFS_I(ip)->i_mapping)) - return -EBUSY; return 0; } @@ -1633,13 +1687,14 @@ xfs_swap_extents( } /* - * Lock up the inodes against other IO and truncate to begin with. - * Then we can ensure the inodes are flushed and have no page cache - * safely. Once we have done this we can take the ilocks and do the rest - * of the checks. + * Lock the inodes against other IO, page faults and truncate to + * begin with. Then we can ensure the inodes are flushed and have no + * page cache safely. Once we have done this we can take the ilocks and + * do the rest of the checks. */ - lock_flags = XFS_IOLOCK_EXCL; + lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { @@ -1666,8 +1721,16 @@ xfs_swap_extents( xfs_trans_cancel(tp, 0); goto out_unlock; } + + /* + * Lock and join the inodes to the tansaction so that transaction commit + * or cancel will unlock the inodes from this point onwards. + */ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); lock_flags |= XFS_ILOCK_EXCL; + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ijoin(tp, tip, lock_flags); + /* Verify all data are being swapped */ if (sxp->sx_offset != 0 || @@ -1720,9 +1783,6 @@ xfs_swap_extents( goto out_trans_cancel; } - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ijoin(tp, tip, lock_flags); - /* * Before we've swapped the forks, lets set the owners of the forks * appropriately. We have to do this as we are demand paging the btree @@ -1856,5 +1916,5 @@ xfs_swap_extents( out_trans_cancel: xfs_trans_cancel(tp, 0); - goto out_unlock; + goto out; } diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 736429a72a12..af97d9a1dfb4 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -63,6 +63,8 @@ int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, xfs_off_t len); +int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, + xfs_off_t len); /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 507d96a57ac7..092d652bc03d 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -537,9 +537,9 @@ xfs_buf_item_push( /* has a previous flush failed due to IO errors? */ if ((bp->b_flags & XBF_WRITE_FAIL) && - ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) { xfs_warn(bp->b_target->bt_mount, -"Detected failing async write on buffer block 0x%llx. Retrying async write.", +"Failing async write on buffer block 0x%llx. Retrying async write.", (long long)bp->b_bn); } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 799e5a2d334d..e85a9519a5ae 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -84,7 +84,7 @@ xfs_trim_extents( error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) goto out_del_cursor; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor); ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 3ee186ac1093..338e50bbfd1e 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -131,7 +131,7 @@ xfs_error_report( { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, - "Internal error %s at line %d of file %s. Caller %pF", + "Internal error %s at line %d of file %s. Caller %pS", tag, linenum, filename, ra); xfs_stack_trace(); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 279a76e52791..c0394ed126fc 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -40,25 +40,25 @@ extern void xfs_verifier_error(struct xfs_buf *bp); /* * Macros to set EFSCORRUPTED & return/branch. */ -#define XFS_WANT_CORRUPTED_GOTO(x,l) \ +#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \ { \ int fs_is_ok = (x); \ ASSERT(fs_is_ok); \ if (unlikely(!fs_is_ok)) { \ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ - XFS_ERRLEVEL_LOW, NULL); \ + XFS_ERRLEVEL_LOW, mp); \ error = -EFSCORRUPTED; \ goto l; \ } \ } -#define XFS_WANT_CORRUPTED_RETURN(x) \ +#define XFS_WANT_CORRUPTED_RETURN(mp, x) \ { \ int fs_is_ok = (x); \ ASSERT(fs_is_ok); \ if (unlikely(!fs_is_ok)) { \ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ - XFS_ERRLEVEL_LOW, NULL); \ + XFS_ERRLEVEL_LOW, mp); \ return -EFSCORRUPTED; \ } \ } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1f12ad0a8585..8121e75352ee 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -559,7 +559,7 @@ xfs_file_aio_write_checks( if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock); + error = xfs_break_layouts(inode, iolock, true); if (error) return error; @@ -569,21 +569,42 @@ xfs_file_aio_write_checks( * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which implies * having to redo all checks before. + * + * We need to serialise against EOF updates that occur in IO + * completions here. We want to make sure that nobody is changing the + * size while we do this check until we have placed an IO barrier (i.e. + * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. + * The spinlock effectively forms a memory barrier once we have the + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value + * and hence be able to correctly determine if we need to run zeroing. */ + spin_lock(&ip->i_flags_lock); if (iocb->ki_pos > i_size_read(inode)) { bool zero = false; + spin_unlock(&ip->i_flags_lock); if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); iov_iter_reexpand(from, count); + + /* + * We now have an IO submission barrier in place, but + * AIO can do EOF updates during IO completion and hence + * we now need to wait for all of them to drain. Non-AIO + * DIO will have drained before we are given the + * XFS_IOLOCK_EXCL, and so for most cases this wait is a + * no-op. + */ + inode_dio_wait(inode); goto restart; } error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); if (error) return error; - } + } else + spin_unlock(&ip->i_flags_lock); /* * Updating the timestamps will grab the ilock again from @@ -645,6 +666,8 @@ xfs_file_dio_aio_write( int iolock; size_t count = iov_iter_count(from); loff_t pos = iocb->ki_pos; + loff_t end; + struct iov_iter data; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -685,10 +708,11 @@ xfs_file_dio_aio_write( goto out; count = iov_iter_count(from); pos = iocb->ki_pos; + end = pos + count - 1; if (mapping->nrpages) { ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - pos, pos + count - 1); + pos, end); if (ret) goto out; /* @@ -698,7 +722,7 @@ xfs_file_dio_aio_write( */ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, pos >> PAGE_CACHE_SHIFT, - (pos + count - 1) >> PAGE_CACHE_SHIFT); + end >> PAGE_CACHE_SHIFT); WARN_ON_ONCE(ret); ret = 0; } @@ -715,8 +739,22 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_direct_write(iocb, from, pos); + data = *from; + ret = mapping->a_ops->direct_IO(iocb, &data, pos); + + /* see generic_file_direct_write() for why this is necessary */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, + end >> PAGE_CACHE_SHIFT); + } + + if (ret > 0) { + pos += ret; + iov_iter_advance(from, ret); + iocb->ki_pos = pos; + } out: xfs_rw_iunlock(ip, iolock); @@ -822,6 +860,11 @@ xfs_file_write_iter( return ret; } +#define XFS_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ + FALLOC_FL_INSERT_RANGE) + STATIC long xfs_file_fallocate( struct file *file, @@ -835,18 +878,21 @@ xfs_file_fallocate( enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; loff_t new_size = 0; + bool do_file_insert = 0; if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + if (mode & ~XFS_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, false); if (error) goto out_unlock; + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -873,6 +919,27 @@ xfs_file_fallocate( error = xfs_collapse_file_space(ip, offset, len); if (error) goto out_unlock; + } else if (mode & FALLOC_FL_INSERT_RANGE) { + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + + new_size = i_size_read(inode) + len; + if (offset & blksize_mask || len & blksize_mask) { + error = -EINVAL; + goto out_unlock; + } + + /* check the new inode size does not wrap through zero */ + if (new_size > inode->i_sb->s_maxbytes) { + error = -EFBIG; + goto out_unlock; + } + + /* Offset should be less than i_size */ + if (offset >= i_size_read(inode)) { + error = -EINVAL; + goto out_unlock; + } + do_file_insert = 1; } else { flags |= XFS_PREALLOC_SET; @@ -907,8 +974,19 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; error = xfs_setattr_size(ip, &iattr); + if (error) + goto out_unlock; } + /* + * Perform hole insertion now that the file size has been + * updated so that if we crash during the operation we don't + * leave shifted extents past EOF and hence losing access to + * the data that is contained within them. + */ + if (do_file_insert) + error = xfs_insert_file_space(ip, offset, len); + out_unlock: xfs_iunlock(ip, iolock); return error; @@ -996,20 +1074,6 @@ xfs_file_mmap( return 0; } -/* - * mmap()d file has taken write protection fault and is being made - * writable. We can set the page state up correctly for a writable - * page, which means we can do correct delalloc accounting (ENOSPC - * checking!) and unwritten extent mapping. - */ -STATIC int -xfs_vm_page_mkwrite( - struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - return block_page_mkwrite(vma, vmf, xfs_get_blocks); -} - /* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). @@ -1385,6 +1449,55 @@ xfs_file_llseek( } } +/* + * Locking for serialisation of IO during page faults. This results in a lock + * ordering of: + * + * mmap_sem (MM) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ +STATIC int +xfs_filemap_fault( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_fault(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = filemap_fault(vma, vmf); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + +/* + * mmap()d file has taken write protection fault and is being made writable. We + * can set the page state up correctly for a writable page, which means we can + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent + * mapping. + */ +STATIC int +xfs_filemap_page_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); + int error; + + trace_xfs_filemap_page_mkwrite(ip); + + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + error = block_page_mkwrite(vma, vmf, xfs_get_blocks); + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + + return error; +} + const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read_iter = xfs_file_read_iter, @@ -1415,7 +1528,7 @@ const struct file_operations xfs_dir_file_operations = { }; static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = filemap_fault, + .fault = xfs_filemap_fault, .map_pages = filemap_map_pages, - .page_mkwrite = xfs_vm_page_mkwrite, + .page_mkwrite = xfs_filemap_page_mkwrite, }; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a2e86e8a0fea..8f9f854376c6 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -322,7 +322,7 @@ xfs_filestream_lookup_ag( pip = xfs_filestream_get_parent(ip); if (!pip) - goto out; + return NULLAGNUMBER; mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); if (mru) { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 74efe5b760dc..cb7e8a29dfb6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -637,12 +637,13 @@ xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) { - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + cnt->allocino = percpu_counter_read_positive(&mp->m_icount); + cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); + cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - + XFS_ALLOC_SET_ASIDE(mp); + spin_lock(&mp->m_sb_lock); - cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); cnt->freertx = mp->m_sb.sb_frextents; - cnt->freeino = mp->m_sb.sb_ifree; - cnt->allocino = mp->m_sb.sb_icount; spin_unlock(&mp->m_sb_lock); return 0; } @@ -692,14 +693,9 @@ xfs_reserve_blocks( * what to do. This means that the amount of free space can * change while we do this, so we need to retry if we end up * trying to reserve more space than is available. - * - * We also use the xfs_mod_incore_sb() interface so that we - * don't have to care about whether per cpu counter are - * enabled, disabled or even compiled in.... */ retry: spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_locked(mp, 0); /* * If our previous reservation was larger than the current value, @@ -716,7 +712,8 @@ xfs_reserve_blocks( } else { __int64_t free; - free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + free = percpu_counter_sum(&mp->m_fdblocks) - + XFS_ALLOC_SET_ASIDE(mp); if (!free) goto out; /* ENOSPC and fdblks_delta = 0 */ @@ -755,8 +752,7 @@ xfs_reserve_blocks( * the extra reserve blocks from the reserve..... */ int error; - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - fdblks_delta, 0); + error = xfs_mod_fdblocks(mp, fdblks_delta, 0); if (error == -ENOSPC) goto retry; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9771b7ef62ed..76a9f2783282 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -439,11 +439,11 @@ xfs_iget( *ipp = ip; /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) + * If we have a real type for an on-disk inode, we can setup the inode * now. If it's a new inode being created, xfs_ialloc will handle it. */ if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) - xfs_setup_inode(ip); + xfs_setup_existing_inode(ip); return 0; out_error_or_again: diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6163767aa856..d6ebc85192b7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared( } /* - * The xfs inode contains 2 locks: a multi-reader lock called the - * i_iolock and a multi-reader lock called the i_lock. This routine - * allows either or both of the locks to be obtained. + * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and + * the i_lock. This routine allows various combinations of the locks to be + * obtained. * - * The 2 locks should always be ordered so that the IO lock is - * obtained first in order to prevent deadlock. + * The 3 locks should always be ordered so that the IO lock is obtained first, + * the mmap lock second and the ilock last in order to prevent deadlock. * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks - * to be locked. It can be: - * XFS_IOLOCK_SHARED, - * XFS_IOLOCK_EXCL, - * XFS_ILOCK_SHARED, - * XFS_ILOCK_EXCL, - * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, - * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, - * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, - * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + * Basic locking order: + * + * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * + * mmap_sem locking order: + * + * i_iolock -> page lock -> mmap_sem + * mmap_sem -> i_mmap_lock -> page_lock + * + * The difference in mmap_sem locking order mean that we cannot hold the + * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can + * fault in pages during copy in/out (for buffered IO) or require the mmap_sem + * in get_user_pages() to map the user pages into the kernel address space for + * direct IO. Similarly the i_iolock cannot be taken inside a page fault because + * page faults already hold the mmap_sem. + * + * Hence to serialise fully against both syscall and mmap based IO, we need to + * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * taken in places where we need to invalidate the page cache in a race + * free manner (e.g. truncate, hole punch and other extent manipulation + * functions). */ void xfs_ilock( @@ -150,6 +160,8 @@ xfs_ilock( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -159,6 +171,11 @@ xfs_ilock( else if (lock_flags & XFS_IOLOCK_SHARED) mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); else if (lock_flags & XFS_ILOCK_SHARED) @@ -191,6 +208,8 @@ xfs_ilock_nowait( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -202,21 +221,35 @@ xfs_ilock_nowait( if (!mrtryaccess(&ip->i_iolock)) goto out; } + + if (lock_flags & XFS_MMAPLOCK_EXCL) { + if (!mrtryupdate(&ip->i_mmaplock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + if (!mrtryaccess(&ip->i_mmaplock)) + goto out_undo_iolock; + } + if (lock_flags & XFS_ILOCK_EXCL) { if (!mrtryupdate(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } else if (lock_flags & XFS_ILOCK_SHARED) { if (!mrtryaccess(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } return 1; - out_undo_iolock: +out_undo_mmaplock: + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); +out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) mrunlock_excl(&ip->i_iolock); else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); - out: +out: return 0; } @@ -244,6 +277,8 @@ xfs_iunlock( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -254,6 +289,11 @@ xfs_iunlock( else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); + if (lock_flags & XFS_ILOCK_EXCL) mrunlock_excl(&ip->i_lock); else if (lock_flags & XFS_ILOCK_SHARED) @@ -271,11 +311,14 @@ xfs_ilock_demote( xfs_inode_t *ip, uint lock_flags) { - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & + ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrdemote(&ip->i_mmaplock); if (lock_flags & XFS_IOLOCK_EXCL) mrdemote(&ip->i_iolock); @@ -294,6 +337,12 @@ xfs_isilocked( return rwsem_is_locked(&ip->i_lock.mr_lock); } + if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { + if (!(lock_flags & XFS_MMAPLOCK_SHARED)) + return !!ip->i_mmaplock.mr_writer; + return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + } + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) return !!ip->i_iolock.mr_writer; @@ -314,14 +363,27 @@ int xfs_lock_delays; #endif /* - * Bump the subclass so xfs_lock_inodes() acquires each lock with - * a different value + * Bump the subclass so xfs_lock_inodes() acquires each lock with a different + * value. This shouldn't be called for page fault locking, but we also need to + * ensure we don't overrun the number of lockdep subclasses for the iolock or + * mmaplock as that is limited to 12 by the mmap lock lockdep annotations. */ static inline int xfs_lock_inumorder(int lock_mode, int subclass) { - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT))); lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + } + + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT))); + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << + XFS_MMAPLOCK_SHIFT; + } + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; @@ -329,15 +391,14 @@ xfs_lock_inumorder(int lock_mode, int subclass) } /* - * The following routine will lock n inodes in exclusive mode. - * We assume the caller calls us with the inodes in i_ino order. + * The following routine will lock n inodes in exclusive mode. We assume the + * caller calls us with the inodes in i_ino order. * - * We need to detect deadlock where an inode that we lock - * is in the AIL and we start waiting for another inode that is locked - * by a thread in a long running transaction (such as truncate). This can - * result in deadlock since the long running trans might need to wait - * for the inode we just locked in order to push the tail and free space - * in the log. + * We need to detect deadlock where an inode that we lock is in the AIL and we + * start waiting for another inode that is locked by a thread in a long running + * transaction (such as truncate). This can result in deadlock since the long + * running trans might need to wait for the inode we just locked in order to + * push the tail and free space in the log. */ void xfs_lock_inodes( @@ -348,30 +409,27 @@ xfs_lock_inodes( int attempts = 0, i, j, try_lock; xfs_log_item_t *lp; - ASSERT(ips && (inodes >= 2)); /* we need at least two */ + /* currently supports between 2 and 5 inodes */ + ASSERT(ips && inodes >= 2 && inodes <= 5); try_lock = 0; i = 0; - again: for (; i < inodes; i++) { ASSERT(ips[i]); - if (i && (ips[i] == ips[i-1])) /* Already locked */ + if (i && (ips[i] == ips[i - 1])) /* Already locked */ continue; /* - * If try_lock is not set yet, make sure all locked inodes - * are not in the AIL. - * If any are, set try_lock to be used later. + * If try_lock is not set yet, make sure all locked inodes are + * not in the AIL. If any are, set try_lock to be used later. */ - if (!try_lock) { for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = (xfs_log_item_t *)ips[j]->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) try_lock++; - } } } @@ -381,51 +439,42 @@ xfs_lock_inodes( * we can't get any, we must release all we have * and try again. */ - - if (try_lock) { - /* try_lock must be 0 if i is 0. */ - /* - * try_lock means we have an inode locked - * that is in the AIL. - */ - ASSERT(i != 0); - if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { - attempts++; - - /* - * Unlock all previous guys and try again. - * xfs_iunlock will try to push the tail - * if the inode is in the AIL. - */ - - for(j = i - 1; j >= 0; j--) { - - /* - * Check to see if we've already - * unlocked this one. - * Not the first one going back, - * and the inode ptr is the same. - */ - if ((j != (i - 1)) && ips[j] == - ips[j+1]) - continue; - - xfs_iunlock(ips[j], lock_mode); - } - - if ((attempts % 5) == 0) { - delay(1); /* Don't just spin the CPU */ -#ifdef DEBUG - xfs_lock_delays++; -#endif - } - i = 0; - try_lock = 0; - goto again; - } - } else { + if (!try_lock) { xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); + continue; } + + /* try_lock means we have an inode locked that is in the AIL. */ + ASSERT(i != 0); + if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) + continue; + + /* + * Unlock all previous guys and try again. xfs_iunlock will try + * to push the tail if the inode is in the AIL. + */ + attempts++; + for (j = i - 1; j >= 0; j--) { + /* + * Check to see if we've already unlocked this one. Not + * the first one going back, and the inode ptr is the + * same. + */ + if (j != (i - 1) && ips[j] == ips[j + 1]) + continue; + + xfs_iunlock(ips[j], lock_mode); + } + + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ +#ifdef DEBUG + xfs_lock_delays++; +#endif + } + i = 0; + try_lock = 0; + goto again; } #ifdef DEBUG @@ -440,10 +489,10 @@ xfs_lock_inodes( } /* - * xfs_lock_two_inodes() can only be used to lock one type of lock - * at a time - the iolock or the ilock, but not both at once. If - * we lock both at once, lockdep will report false positives saying - * we have violated locking orders. + * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. */ void xfs_lock_two_inodes( @@ -455,8 +504,12 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) - ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { @@ -818,7 +871,7 @@ xfs_ialloc( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); - /* now that we have an i_mode we can setup inode ops and unlock */ + /* now that we have an i_mode we can setup the inode structure */ xfs_setup_inode(ip); *ipp = ip; @@ -1235,12 +1288,14 @@ xfs_create( xfs_trans_cancel(tp, cancel_flags); out_release_inode: /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. */ - if (ip) + if (ip) { + xfs_finish_inode_setup(ip); IRELE(ip); + } xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -1345,12 +1400,14 @@ xfs_create_tmpfile( xfs_trans_cancel(tp, cancel_flags); out_release_inode: /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. */ - if (ip) + if (ip) { + xfs_finish_inode_setup(ip); IRELE(ip); + } xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -2611,19 +2668,22 @@ xfs_remove( /* * Enter all inodes for a rename transaction into a sorted array. */ +#define __XFS_SORT_INODES 5 STATIC void xfs_sort_for_rename( - xfs_inode_t *dp1, /* in: old (source) directory inode */ - xfs_inode_t *dp2, /* in: new (target) directory inode */ - xfs_inode_t *ip1, /* in: inode of old entry */ - xfs_inode_t *ip2, /* in: inode of new entry, if it - already exists, NULL otherwise. */ - xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ - int *num_inodes) /* out: number of inodes in array */ + struct xfs_inode *dp1, /* in: old (source) directory inode */ + struct xfs_inode *dp2, /* in: new (target) directory inode */ + struct xfs_inode *ip1, /* in: inode of old entry */ + struct xfs_inode *ip2, /* in: inode of new entry */ + struct xfs_inode *wip, /* in: whiteout inode */ + struct xfs_inode **i_tab,/* out: sorted array of inodes */ + int *num_inodes) /* in/out: inodes in array */ { - xfs_inode_t *temp; int i, j; + ASSERT(*num_inodes == __XFS_SORT_INODES); + memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); + /* * i_tab contains a list of pointers to inodes. We initialize * the table here & we'll sort it. We will then use it to @@ -2631,25 +2691,24 @@ xfs_sort_for_rename( * * Note that the table may contain duplicates. e.g., dp1 == dp2. */ - i_tab[0] = dp1; - i_tab[1] = dp2; - i_tab[2] = ip1; - if (ip2) { - *num_inodes = 4; - i_tab[3] = ip2; - } else { - *num_inodes = 3; - i_tab[3] = NULL; - } + i = 0; + i_tab[i++] = dp1; + i_tab[i++] = dp2; + i_tab[i++] = ip1; + if (ip2) + i_tab[i++] = ip2; + if (wip) + i_tab[i++] = wip; + *num_inodes = i; /* * Sort the elements via bubble sort. (Remember, there are at - * most 4 elements to sort, so this is adequate.) + * most 5 elements to sort, so this is adequate.) */ for (i = 0; i < *num_inodes; i++) { for (j = 1; j < *num_inodes; j++) { if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { - temp = i_tab[j]; + struct xfs_inode *temp = i_tab[j]; i_tab[j] = i_tab[j-1]; i_tab[j-1] = temp; } @@ -2657,6 +2716,31 @@ xfs_sort_for_rename( } } +static int +xfs_finish_rename( + struct xfs_trans *tp, + struct xfs_bmap_free *free_list) +{ + int committed = 0; + int error; + + /* + * If this is a synchronous mount, make sure that the rename transaction + * goes to disk before returning to the user. + */ + if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, free_list, &committed); + if (error) { + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + return error; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +} + /* * xfs_cross_rename() * @@ -2685,14 +2769,14 @@ xfs_cross_rename( ip2->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* Swap inode number for dirent in second parent */ error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* * If we're renaming one or more directories across different parents, @@ -2707,16 +2791,16 @@ xfs_cross_rename( dp1->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* transfer ip2 ".." reference to dp1 */ if (!S_ISDIR(ip1->i_d.di_mode)) { error = xfs_droplink(tp, dp2); if (error) - goto out; + goto out_trans_abort; error = xfs_bumplink(tp, dp1); if (error) - goto out; + goto out_trans_abort; } /* @@ -2734,16 +2818,16 @@ xfs_cross_rename( dp2->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* transfer ip1 ".." reference to dp2 */ if (!S_ISDIR(ip2->i_d.di_mode)) { error = xfs_droplink(tp, dp1); if (error) - goto out; + goto out_trans_abort; error = xfs_bumplink(tp, dp2); if (error) - goto out; + goto out_trans_abort; } /* @@ -2771,66 +2855,108 @@ xfs_cross_rename( } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); -out: + return xfs_finish_rename(tp, free_list); + +out_trans_abort: + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); return error; } +/* + * xfs_rename_alloc_whiteout() + * + * Return a referenced, unlinked, unlocked inode that that can be used as a + * whiteout in a rename transaction. We use a tmpfile inode here so that if we + * crash between allocating the inode and linking it into the rename transaction + * recovery will free the inode and we won't leak it. + */ +static int +xfs_rename_alloc_whiteout( + struct xfs_inode *dp, + struct xfs_inode **wip) +{ + struct xfs_inode *tmpfile; + int error; + + error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile); + if (error) + return error; + + /* Satisfy xfs_bumplink that this is a real tmpfile */ + xfs_finish_inode_setup(tmpfile); + VFS_I(tmpfile)->i_state |= I_LINKABLE; + + *wip = tmpfile; + return 0; +} + /* * xfs_rename */ int xfs_rename( - xfs_inode_t *src_dp, - struct xfs_name *src_name, - xfs_inode_t *src_ip, - xfs_inode_t *target_dp, - struct xfs_name *target_name, - xfs_inode_t *target_ip, - unsigned int flags) + struct xfs_inode *src_dp, + struct xfs_name *src_name, + struct xfs_inode *src_ip, + struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip, + unsigned int flags) { - xfs_trans_t *tp = NULL; - xfs_mount_t *mp = src_dp->i_mount; - int new_parent; /* moving to a new dir */ - int src_is_directory; /* src_name is a directory */ - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - xfs_inode_t *inodes[4]; - int spaceres; - int num_inodes; + struct xfs_mount *mp = src_dp->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + struct xfs_inode *wip = NULL; /* whiteout inode */ + struct xfs_inode *inodes[__XFS_SORT_INODES]; + int num_inodes = __XFS_SORT_INODES; + bool new_parent = (src_dp != target_dp); + bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + int cancel_flags = 0; + int spaceres; + int error; trace_xfs_rename(src_dp, target_dp, src_name, target_name); - new_parent = (src_dp != target_dp); - src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + if ((flags & RENAME_EXCHANGE) && !target_ip) + return -EINVAL; - xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + /* + * If we are doing a whiteout operation, allocate the whiteout inode + * we will be placing at the target and ensure the type is set + * appropriately. + */ + if (flags & RENAME_WHITEOUT) { + ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); + error = xfs_rename_alloc_whiteout(target_dp, &wip); + if (error) + return error; + + /* setup target dirent info as whiteout */ + src_name->type = XFS_DIR3_FT_CHRDEV; + } + + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); - xfs_bmap_init(&free_list, &first_block); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); if (error == -ENOSPC) { spaceres = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); } - if (error) { - xfs_trans_cancel(tp, 0); - goto std_return; - } + if (error) + goto out_trans_cancel; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * Attach the dquots to the inodes */ error = xfs_qm_vop_rename_dqattach(inodes); - if (error) { - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - } + if (error) + goto out_trans_cancel; /* * Lock all the participating inodes. Depending upon whether @@ -2851,6 +2977,8 @@ xfs_rename( xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + if (wip) + xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames @@ -2860,24 +2988,16 @@ xfs_rename( if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { error = -EXDEV; - goto error_return; + goto out_trans_cancel; } - /* - * Handle RENAME_EXCHANGE flags - */ - if (flags & RENAME_EXCHANGE) { - if (target_ip == NULL) { - error = -EINVAL; - goto error_return; - } - error = xfs_cross_rename(tp, src_dp, src_name, src_ip, - target_dp, target_name, target_ip, - &free_list, &first_block, spaceres); - if (error) - goto abort_return; - goto finish_rename; - } + xfs_bmap_init(&free_list, &first_block); + + /* RENAME_EXCHANGE is unique from here on. */ + if (flags & RENAME_EXCHANGE) + return xfs_cross_rename(tp, src_dp, src_name, src_ip, + target_dp, target_name, target_ip, + &free_list, &first_block, spaceres); /* * Set up the target. @@ -2890,7 +3010,7 @@ xfs_rename( if (!spaceres) { error = xfs_dir_canenter(tp, target_dp, target_name); if (error) - goto error_return; + goto out_trans_cancel; } /* * If target does not exist and the rename crosses @@ -2901,9 +3021,9 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error == -ENOSPC) - goto error_return; + goto out_bmap_cancel; if (error) - goto abort_return; + goto out_trans_abort; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2911,7 +3031,7 @@ xfs_rename( if (new_parent && src_is_directory) { error = xfs_bumplink(tp, target_dp); if (error) - goto abort_return; + goto out_trans_abort; } } else { /* target_ip != NULL */ /* @@ -2926,7 +3046,7 @@ xfs_rename( if (!(xfs_dir_isempty(target_ip)) || (target_ip->i_d.di_nlink > 2)) { error = -EEXIST; - goto error_return; + goto out_trans_cancel; } } @@ -2943,7 +3063,7 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto abort_return; + goto out_trans_abort; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2954,7 +3074,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto abort_return; + goto out_trans_abort; if (src_is_directory) { /* @@ -2962,7 +3082,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto abort_return; + goto out_trans_abort; } } /* target_ip != NULL */ @@ -2979,7 +3099,7 @@ xfs_rename( &first_block, &free_list, spaceres); ASSERT(error != -EEXIST); if (error) - goto abort_return; + goto out_trans_abort; } /* @@ -3005,49 +3125,67 @@ xfs_rename( */ error = xfs_droplink(tp, src_dp); if (error) - goto abort_return; + goto out_trans_abort; } - error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + /* + * For whiteouts, we only need to update the source dirent with the + * inode number of the whiteout inode rather than removing it + * altogether. + */ + if (wip) { + error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, &first_block, &free_list, spaceres); + } else + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); if (error) - goto abort_return; + goto out_trans_abort; + + /* + * For whiteouts, we need to bump the link count on the whiteout inode. + * This means that failures all the way up to this point leave the inode + * on the unlinked list and so cleanup is a simple matter of dropping + * the remaining reference to it. If we fail here after bumping the link + * count, we're shutting down the filesystem so we'll never see the + * intermediate state on disk. + */ + if (wip) { + ASSERT(wip->i_d.di_nlink == 0); + error = xfs_bumplink(tp, wip); + if (error) + goto out_trans_abort; + error = xfs_iunlink_remove(tp, wip); + if (error) + goto out_trans_abort; + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); + + /* + * Now we have a real link, clear the "I'm a tmpfile" state + * flag from the inode so it doesn't accidentally get misused in + * future. + */ + VFS_I(wip)->i_state &= ~I_LINKABLE; + } xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); -finish_rename: - /* - * If this is a synchronous mount, make sure that the - * rename transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } + error = xfs_finish_rename(tp, &free_list); + if (wip) + IRELE(wip); + return error; - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - goto std_return; - } - - /* - * trans_commit will unlock src_ip, target_ip & decrement - * the vnode references. - */ - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - - abort_return: +out_trans_abort: cancel_flags |= XFS_TRANS_ABORT; - error_return: +out_bmap_cancel: xfs_bmap_cancel(&free_list); +out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); - std_return: + if (wip) + IRELE(wip); return error; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a1cd55f3f351..8f22d20368d8 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -56,6 +56,7 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ + mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ @@ -263,15 +264,20 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_SHARED (1<<1) #define XFS_ILOCK_EXCL (1<<2) #define XFS_ILOCK_SHARED (1<<3) +#define XFS_MMAPLOCK_EXCL (1<<4) +#define XFS_MMAPLOCK_SHARED (1<<5) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ - | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) + | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ + | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED) #define XFS_LOCK_FLAGS \ { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ - { XFS_ILOCK_SHARED, "ILOCK_SHARED" } + { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \ + { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \ + { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" } /* @@ -302,17 +308,26 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_SHIFT 16 #define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) +#define XFS_MMAPLOCK_SHIFT 20 + #define XFS_ILOCK_SHIFT 24 #define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) -#define XFS_IOLOCK_DEP_MASK 0x00ff0000 +#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 #define XFS_ILOCK_DEP_MASK 0xff000000 -#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) +#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \ + XFS_MMAPLOCK_DEP_MASK | \ + XFS_ILOCK_DEP_MASK) -#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) -#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) +#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \ + >> XFS_IOLOCK_SHIFT) +#define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \ + >> XFS_MMAPLOCK_SHIFT) +#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \ + >> XFS_ILOCK_SHIFT) /* * For multiple groups support: if S_ISGID bit is set in the parent @@ -391,6 +406,28 @@ int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +/* from xfs_iops.c */ +/* + * When setting up a newly allocated inode, we need to call + * xfs_finish_inode_setup() once the inode is fully instantiated at + * the VFS level to prevent the rest of the world seeing the inode + * before we've completed instantiation. Otherwise we can do it + * the moment the inode lookup is complete. + */ +extern void xfs_setup_inode(struct xfs_inode *ip); +static inline void xfs_finish_inode_setup(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + unlock_new_inode(VFS_I(ip)); +} + +static inline void xfs_setup_existing_inode(struct xfs_inode *ip) +{ + xfs_setup_inode(ip); + xfs_finish_inode_setup(ip); +} + #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ac4feae45eb3..5f4a396f5186 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -631,7 +631,7 @@ xfs_ioc_space( if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; - if (ioflags & XFS_IO_INVIS) + if (ioflags & XFS_IO_INVIS) flags |= XFS_PREALLOC_INVISIBLE; error = mnt_want_write_file(filp); @@ -639,10 +639,13 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, false); if (error) goto out_unlock; + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + switch (bf->l_whence) { case 0: /*SEEK_SET*/ break; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ccb1dd0d509e..38e633bad8c2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -460,8 +460,7 @@ xfs_iomap_prealloc_size( alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), alloc_blocks); - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); - freesp = mp->m_sb.sb_fdblocks; + freesp = percpu_counter_read_positive(&mp->m_fdblocks); if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { shift = 2; if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e53a90331422..2f1839e4dd1b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -187,6 +187,8 @@ xfs_generic_create( else d_instantiate(dentry, inode); + xfs_finish_inode_setup(ip); + out_free_acl: if (default_acl) posix_acl_release(default_acl); @@ -195,6 +197,7 @@ xfs_generic_create( return error; out_cleanup_inode: + xfs_finish_inode_setup(ip); if (!tmpfile) xfs_cleanup_inode(dir, inode, dentry); iput(inode); @@ -367,9 +370,11 @@ xfs_vn_symlink( goto out_cleanup_inode; d_instantiate(dentry, inode); + xfs_finish_inode_setup(cip); return 0; out_cleanup_inode: + xfs_finish_inode_setup(cip); xfs_cleanup_inode(dir, inode, dentry); iput(inode); out: @@ -389,7 +394,7 @@ xfs_vn_rename( struct xfs_name oname; struct xfs_name nname; - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; /* if we are exchanging files, we need to set i_mode of both files */ @@ -766,6 +771,7 @@ xfs_setattr_size( return error; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); ASSERT(S_ISREG(ip->i_d.di_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); @@ -829,55 +835,27 @@ xfs_setattr_size( inode_dio_wait(inode); /* - * Do all the page cache truncate work outside the transaction context - * as the "lock" order is page lock->log space reservation. i.e. - * locking pages inside the transaction can ABBA deadlock with - * writeback. We have to do the VFS inode size update before we truncate - * the pagecache, however, to avoid racing with page faults beyond the - * new EOF they are not serialised against truncate operations except by - * page locks and size updates. + * We've already locked out new page faults, so now we can safely remove + * pages from the page cache knowing they won't get refaulted until we + * drop the XFS_MMAP_EXCL lock after the extent manipulations are + * complete. The truncate_setsize() call also cleans partial EOF page + * PTEs on extending truncates and hence ensures sub-page block size + * filesystems are correctly handled, too. * - * Hence we are in a situation where a truncate can fail with ENOMEM - * from xfs_trans_reserve(), but having already truncated the in-memory - * version of the file (i.e. made user visible changes). There's not - * much we can do about this, except to hope that the caller sees ENOMEM - * and retries the truncate operation. + * We have to do all the page cache truncate work outside the + * transaction context as the "lock" order is page lock->log space + * reservation as defined by extent allocation in the writeback path. + * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but + * having already truncated the in-memory version of the file (i.e. made + * user visible changes). There's not much we can do about this, except + * to hope that the caller sees ENOMEM and retries the truncate + * operation. */ error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); if (error) return error; truncate_setsize(inode, newsize); - /* - * The "we can't serialise against page faults" pain gets worse. - * - * If the file is mapped then we have to clean the page at the old EOF - * when extending the file. Extending the file can expose changes the - * underlying page mapping (e.g. from beyond EOF to a hole or - * unwritten), and so on the next attempt to write to that page we need - * to remap it for write. i.e. we need .page_mkwrite() to be called. - * Hence we need to clean the page to clean the pte and so a new write - * fault will be triggered appropriately. - * - * If we do it before we change the inode size, then we can race with a - * page fault that maps the page with exactly the same problem. If we do - * it after we change the file size, then a new page fault can come in - * and allocate space before we've run the rest of the truncate - * transaction. That's kinda grotesque, but it's better than have data - * over a hole, and so that's the lesser evil that has been chosen here. - * - * The real solution, however, is to have some mechanism for locking out - * page faults while a truncate is in progress. - */ - if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) { - error = filemap_write_and_wait_range( - VFS_I(ip)->i_mapping, - round_down(oldsize, PAGE_CACHE_SIZE), - round_up(oldsize, PAGE_CACHE_SIZE) - 1); - if (error) - return error; - } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) @@ -975,9 +953,13 @@ xfs_vn_setattr( uint iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); - error = xfs_break_layouts(dentry->d_inode, &iolock); - if (!error) + error = xfs_break_layouts(dentry->d_inode, &iolock, true); + if (!error) { + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + error = xfs_setattr_size(ip, iattr); + } xfs_iunlock(ip, iolock); } else { error = xfs_setattr_nonsize(ip, iattr, 0); @@ -1228,16 +1210,12 @@ xfs_diflags_to_iflags( } /* - * Initialize the Linux inode, set up the operation vectors and - * unlock the inode. + * Initialize the Linux inode and set up the operation vectors. * - * When reading existing inodes from disk this is called directly - * from xfs_iget, when creating a new inode it is called from - * xfs_ialloc after setting up the inode. - * - * We are always called with an uninitialised linux inode here. - * We need to initialise the necessary fields and take a reference - * on it. + * When reading existing inodes from disk this is called directly from xfs_iget, + * when creating a new inode it is called from xfs_ialloc after setting up the + * inode. These callers have different criteria for clearing XFS_INEW, so leave + * it up to the caller to deal with unlocking the inode appropriately. */ void xfs_setup_inode( @@ -1324,9 +1302,4 @@ xfs_setup_inode( inode_has_no_xattr(inode); cache_no_acl(inode); } - - xfs_iflags_clear(ip, XFS_INEW); - barrier(); - - unlock_new_inode(inode); } diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index ea7a98e9cb70..a0f84abb0d09 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -extern void xfs_setup_inode(struct xfs_inode *); - /* * Internal setattr interfaces. */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 82e314258f73..80429891dc9b 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -229,7 +229,7 @@ xfs_bulkstat_grab_ichunk( error = xfs_inobt_get_rec(cur, irec, &stat); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(stat == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1); /* Check if the record contains the inode in request */ if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index c31d2c2eadc4..7c7842c85a08 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -116,15 +116,6 @@ typedef __uint64_t __psunsigned_t; #undef XFS_NATIVE_HOST #endif -/* - * Feature macros (disable/enable) - */ -#ifdef CONFIG_SMP -#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ -#else -#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ -#endif - #define irix_sgid_inherit xfs_params.sgid_inherit.val #define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a5a945fc3bdc..4f5784f85a5b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4463,10 +4463,10 @@ xlog_do_recover( xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); ASSERT(xfs_sb_good_version(sbp)); + xfs_reinit_percpu_counters(log->l_mp); + xfs_buf_relse(bp); - /* We've re-read the superblock so re-initialize per-cpu counters */ - xfs_icsb_reinit_counters(log->l_mp); xlog_recover_check_summary(log); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 4fa80e63eea2..2ce7ee3b4ec1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,18 +43,6 @@ #include "xfs_sysfs.h" -#ifdef HAVE_PERCPU_SB -STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, - int); -STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, - int); -STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); -#else - -#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) -#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) -#endif - static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; @@ -347,8 +335,7 @@ xfs_readsb( goto reread; } - /* Initialize per-cpu counters */ - xfs_icsb_reinit_counters(mp); + xfs_reinit_percpu_counters(mp); /* no need to be quiet anymore, so reset the buf ops */ bp->b_ops = &xfs_sb_buf_ops; @@ -1087,8 +1074,6 @@ xfs_log_sbcount(xfs_mount_t *mp) if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) return 0; - xfs_icsb_sync_counters(mp, 0); - /* * we don't need to do this if we are updating the superblock * counters on every modification. @@ -1099,253 +1084,136 @@ xfs_log_sbcount(xfs_mount_t *mp) return xfs_sync_sb(mp, true); } -/* - * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply - * a delta to a specified field in the in-core superblock. Simply - * switch on the field indicated and apply the delta to that field. - * Fields are not allowed to dip below zero, so if the delta would - * do this do not apply it and return EINVAL. - * - * The m_sb_lock must be held when this routine is called. - */ -STATIC int -xfs_mod_incore_sb_unlocked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) +int +xfs_mod_icount( + struct xfs_mount *mp, + int64_t delta) { - int scounter; /* short counter for 32 bit fields */ - long long lcounter; /* long counter for 64 bit fields */ - long long res_used, rem; - - /* - * With the in-core superblock spin lock held, switch - * on the indicated field. Apply the delta to the - * proper field. If the fields value would dip below - * 0, then do not apply the delta and return EINVAL. - */ - switch (field) { - case XFS_SBS_ICOUNT: - lcounter = (long long)mp->m_sb.sb_icount; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_icount = lcounter; - return 0; - case XFS_SBS_IFREE: - lcounter = (long long)mp->m_sb.sb_ifree; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_ifree = lcounter; - return 0; - case XFS_SBS_FDBLOCKS: - lcounter = (long long) - mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); - res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); - - if (delta > 0) { /* Putting blocks back */ - if (res_used > delta) { - mp->m_resblks_avail += delta; - } else { - rem = delta - res_used; - mp->m_resblks_avail = mp->m_resblks; - lcounter += rem; - } - } else { /* Taking blocks away */ - lcounter += delta; - if (lcounter >= 0) { - mp->m_sb.sb_fdblocks = lcounter + - XFS_ALLOC_SET_ASIDE(mp); - return 0; - } - - /* - * We are out of blocks, use any available reserved - * blocks if were allowed to. - */ - if (!rsvd) - return -ENOSPC; - - lcounter = (long long)mp->m_resblks_avail + delta; - if (lcounter >= 0) { - mp->m_resblks_avail = lcounter; - return 0; - } - printk_once(KERN_WARNING - "Filesystem \"%s\": reserve blocks depleted! " - "Consider increasing reserve pool size.", - mp->m_fsname); - return -ENOSPC; - } - - mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); - return 0; - case XFS_SBS_FREXTENTS: - lcounter = (long long)mp->m_sb.sb_frextents; - lcounter += delta; - if (lcounter < 0) { - return -ENOSPC; - } - mp->m_sb.sb_frextents = lcounter; - return 0; - case XFS_SBS_DBLOCKS: - lcounter = (long long)mp->m_sb.sb_dblocks; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_dblocks = lcounter; - return 0; - case XFS_SBS_AGCOUNT: - scounter = mp->m_sb.sb_agcount; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_agcount = scounter; - return 0; - case XFS_SBS_IMAX_PCT: - scounter = mp->m_sb.sb_imax_pct; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_imax_pct = scounter; - return 0; - case XFS_SBS_REXTSIZE: - scounter = mp->m_sb.sb_rextsize; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextsize = scounter; - return 0; - case XFS_SBS_RBMBLOCKS: - scounter = mp->m_sb.sb_rbmblocks; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rbmblocks = scounter; - return 0; - case XFS_SBS_RBLOCKS: - lcounter = (long long)mp->m_sb.sb_rblocks; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rblocks = lcounter; - return 0; - case XFS_SBS_REXTENTS: - lcounter = (long long)mp->m_sb.sb_rextents; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextents = lcounter; - return 0; - case XFS_SBS_REXTSLOG: - scounter = mp->m_sb.sb_rextslog; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextslog = scounter; - return 0; - default: + /* deltas are +/-64, hence the large batch size of 128. */ + __percpu_counter_add(&mp->m_icount, delta, 128); + if (percpu_counter_compare(&mp->m_icount, 0) < 0) { ASSERT(0); + percpu_counter_add(&mp->m_icount, -delta); return -EINVAL; } + return 0; } -/* - * xfs_mod_incore_sb() is used to change a field in the in-core - * superblock structure by the specified delta. This modification - * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked() - * routine to do the work. - */ int -xfs_mod_incore_sb( +xfs_mod_ifree( + struct xfs_mount *mp, + int64_t delta) +{ + percpu_counter_add(&mp->m_ifree, delta); + if (percpu_counter_compare(&mp->m_ifree, 0) < 0) { + ASSERT(0); + percpu_counter_add(&mp->m_ifree, -delta); + return -EINVAL; + } + return 0; +} + +int +xfs_mod_fdblocks( struct xfs_mount *mp, - xfs_sb_field_t field, int64_t delta, - int rsvd) + bool rsvd) { - int status; + int64_t lcounter; + long long res_used; + s32 batch; -#ifdef HAVE_PERCPU_SB - ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); -#endif - spin_lock(&mp->m_sb_lock); - status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - spin_unlock(&mp->m_sb_lock); + if (delta > 0) { + /* + * If the reserve pool is depleted, put blocks back into it + * first. Most of the time the pool is full. + */ + if (likely(mp->m_resblks == mp->m_resblks_avail)) { + percpu_counter_add(&mp->m_fdblocks, delta); + return 0; + } - return status; -} + spin_lock(&mp->m_sb_lock); + res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); -/* - * Change more than one field in the in-core superblock structure at a time. - * - * The fields and changes to those fields are specified in the array of - * xfs_mod_sb structures passed in. Either all of the specified deltas - * will be applied or none of them will. If any modified field dips below 0, - * then all modifications will be backed out and EINVAL will be returned. - * - * Note that this function may not be used for the superblock values that - * are tracked with the in-memory per-cpu counters - a direct call to - * xfs_icsb_modify_counters is required for these. - */ -int -xfs_mod_incore_sb_batch( - struct xfs_mount *mp, - xfs_mod_sb_t *msb, - uint nmsb, - int rsvd) -{ - xfs_mod_sb_t *msbp; - int error = 0; + if (res_used > delta) { + mp->m_resblks_avail += delta; + } else { + delta -= res_used; + mp->m_resblks_avail = mp->m_resblks; + percpu_counter_add(&mp->m_fdblocks, delta); + } + spin_unlock(&mp->m_sb_lock); + return 0; + } /* - * Loop through the array of mod structures and apply each individually. - * If any fail, then back out all those which have already been applied. - * Do all of this within the scope of the m_sb_lock so that all of the - * changes will be atomic. + * Taking blocks away, need to be more accurate the closer we + * are to zero. + * + * batch size is set to a maximum of 1024 blocks - if we are + * allocating of freeing extents larger than this then we aren't + * going to be hammering the counter lock so a lock per update + * is not a problem. + * + * If the counter has a value of less than 2 * max batch size, + * then make everything serialise as we are real close to + * ENOSPC. + */ +#define __BATCH 1024 + if (percpu_counter_compare(&mp->m_fdblocks, 2 * __BATCH) < 0) + batch = 1; + else + batch = __BATCH; +#undef __BATCH + + __percpu_counter_add(&mp->m_fdblocks, delta, batch); + if (percpu_counter_compare(&mp->m_fdblocks, + XFS_ALLOC_SET_ASIDE(mp)) >= 0) { + /* we had space! */ + return 0; + } + + /* + * lock up the sb for dipping into reserves before releasing the space + * that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); - for (msbp = msb; msbp < (msb + nmsb); msbp++) { - ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || - msbp->msb_field > XFS_SBS_FDBLOCKS); + percpu_counter_add(&mp->m_fdblocks, -delta); + if (!rsvd) + goto fdblocks_enospc; - error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, - msbp->msb_delta, rsvd); - if (error) - goto unwind; + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + spin_unlock(&mp->m_sb_lock); + return 0; } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! " + "Consider increasing reserve pool size.", + mp->m_fsname); +fdblocks_enospc: spin_unlock(&mp->m_sb_lock); - return 0; + return -ENOSPC; +} -unwind: - while (--msbp >= msb) { - error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, - -msbp->msb_delta, rsvd); - ASSERT(error == 0); - } +int +xfs_mod_frextents( + struct xfs_mount *mp, + int64_t delta) +{ + int64_t lcounter; + int ret = 0; + + spin_lock(&mp->m_sb_lock); + lcounter = mp->m_sb.sb_frextents + delta; + if (lcounter < 0) + ret = -ENOSPC; + else + mp->m_sb.sb_frextents = lcounter; spin_unlock(&mp->m_sb_lock); - return error; + return ret; } /* @@ -1407,573 +1275,3 @@ xfs_dev_is_read_only( } return 0; } - -#ifdef HAVE_PERCPU_SB -/* - * Per-cpu incore superblock counters - * - * Simple concept, difficult implementation - * - * Basically, replace the incore superblock counters with a distributed per cpu - * counter for contended fields (e.g. free block count). - * - * Difficulties arise in that the incore sb is used for ENOSPC checking, and - * hence needs to be accurately read when we are running low on space. Hence - * there is a method to enable and disable the per-cpu counters based on how - * much "stuff" is available in them. - * - * Basically, a counter is enabled if there is enough free resource to justify - * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local - * ENOSPC), then we disable the counters to synchronise all callers and - * re-distribute the available resources. - * - * If, once we redistributed the available resources, we still get a failure, - * we disable the per-cpu counter and go through the slow path. - * - * The slow path is the current xfs_mod_incore_sb() function. This means that - * when we disable a per-cpu counter, we need to drain its resources back to - * the global superblock. We do this after disabling the counter to prevent - * more threads from queueing up on the counter. - * - * Essentially, this means that we still need a lock in the fast path to enable - * synchronisation between the global counters and the per-cpu counters. This - * is not a problem because the lock will be local to a CPU almost all the time - * and have little contention except when we get to ENOSPC conditions. - * - * Basically, this lock becomes a barrier that enables us to lock out the fast - * path while we do things like enabling and disabling counters and - * synchronising the counters. - * - * Locking rules: - * - * 1. m_sb_lock before picking up per-cpu locks - * 2. per-cpu locks always picked up via for_each_online_cpu() order - * 3. accurate counter sync requires m_sb_lock + per cpu locks - * 4. modifying per-cpu counters requires holding per-cpu lock - * 5. modifying global counters requires holding m_sb_lock - * 6. enabling or disabling a counter requires holding the m_sb_lock - * and _none_ of the per-cpu locks. - * - * Disabled counters are only ever re-enabled by a balance operation - * that results in more free resources per CPU than a given threshold. - * To ensure counters don't remain disabled, they are rebalanced when - * the global resource goes above a higher threshold (i.e. some hysteresis - * is present to prevent thrashing). - */ - -#ifdef CONFIG_HOTPLUG_CPU -/* - * hot-plug CPU notifier support. - * - * We need a notifier per filesystem as we need to be able to identify - * the filesystem to balance the counters out. This is achieved by - * having a notifier block embedded in the xfs_mount_t and doing pointer - * magic to get the mount pointer from the notifier block address. - */ -STATIC int -xfs_icsb_cpu_notify( - struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - xfs_icsb_cnts_t *cntp; - xfs_mount_t *mp; - - mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier); - cntp = (xfs_icsb_cnts_t *) - per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - /* Easy Case - initialize the area and locks, and - * then rebalance when online does everything else for us. */ - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - xfs_icsb_lock(mp); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); - xfs_icsb_unlock(mp); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* Disable all the counters, then fold the dead cpu's - * count into the total on the global superblock and - * re-enable the counters. */ - xfs_icsb_lock(mp); - spin_lock(&mp->m_sb_lock); - xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT); - xfs_icsb_disable_counter(mp, XFS_SBS_IFREE); - xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS); - - mp->m_sb.sb_icount += cntp->icsb_icount; - mp->m_sb.sb_ifree += cntp->icsb_ifree; - mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks; - - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - - xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0); - spin_unlock(&mp->m_sb_lock); - xfs_icsb_unlock(mp); - break; - } - - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ - -int -xfs_icsb_init_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t); - if (mp->m_sb_cnts == NULL) - return -ENOMEM; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - } - - mutex_init(&mp->m_icsb_mutex); - - /* - * start with all counters disabled so that the - * initial balance kicks us off correctly - */ - mp->m_icsb_counters = -1; - -#ifdef CONFIG_HOTPLUG_CPU - mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; - mp->m_icsb_notifier.priority = 0; - register_hotcpu_notifier(&mp->m_icsb_notifier); -#endif /* CONFIG_HOTPLUG_CPU */ - - return 0; -} - -void -xfs_icsb_reinit_counters( - xfs_mount_t *mp) -{ - xfs_icsb_lock(mp); - /* - * start with all counters disabled so that the - * initial balance kicks us off correctly - */ - mp->m_icsb_counters = -1; - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); - xfs_icsb_unlock(mp); -} - -void -xfs_icsb_destroy_counters( - xfs_mount_t *mp) -{ - if (mp->m_sb_cnts) { - unregister_hotcpu_notifier(&mp->m_icsb_notifier); - free_percpu(mp->m_sb_cnts); - } - mutex_destroy(&mp->m_icsb_mutex); -} - -STATIC void -xfs_icsb_lock_cntr( - xfs_icsb_cnts_t *icsbp) -{ - while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) { - ndelay(1000); - } -} - -STATIC void -xfs_icsb_unlock_cntr( - xfs_icsb_cnts_t *icsbp) -{ - clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags); -} - - -STATIC void -xfs_icsb_lock_all_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - xfs_icsb_lock_cntr(cntp); - } -} - -STATIC void -xfs_icsb_unlock_all_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - xfs_icsb_unlock_cntr(cntp); - } -} - -STATIC void -xfs_icsb_count( - xfs_mount_t *mp, - xfs_icsb_cnts_t *cnt, - int flags) -{ - xfs_icsb_cnts_t *cntp; - int i; - - memset(cnt, 0, sizeof(xfs_icsb_cnts_t)); - - if (!(flags & XFS_ICSB_LAZY_COUNT)) - xfs_icsb_lock_all_counters(mp); - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - cnt->icsb_icount += cntp->icsb_icount; - cnt->icsb_ifree += cntp->icsb_ifree; - cnt->icsb_fdblocks += cntp->icsb_fdblocks; - } - - if (!(flags & XFS_ICSB_LAZY_COUNT)) - xfs_icsb_unlock_all_counters(mp); -} - -STATIC int -xfs_icsb_counter_disabled( - xfs_mount_t *mp, - xfs_sb_field_t field) -{ - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - return test_bit(field, &mp->m_icsb_counters); -} - -STATIC void -xfs_icsb_disable_counter( - xfs_mount_t *mp, - xfs_sb_field_t field) -{ - xfs_icsb_cnts_t cnt; - - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - - /* - * If we are already disabled, then there is nothing to do - * here. We check before locking all the counters to avoid - * the expensive lock operation when being called in the - * slow path and the counter is already disabled. This is - * safe because the only time we set or clear this state is under - * the m_icsb_mutex. - */ - if (xfs_icsb_counter_disabled(mp, field)) - return; - - xfs_icsb_lock_all_counters(mp); - if (!test_and_set_bit(field, &mp->m_icsb_counters)) { - /* drain back to superblock */ - - xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); - switch(field) { - case XFS_SBS_ICOUNT: - mp->m_sb.sb_icount = cnt.icsb_icount; - break; - case XFS_SBS_IFREE: - mp->m_sb.sb_ifree = cnt.icsb_ifree; - break; - case XFS_SBS_FDBLOCKS: - mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; - break; - default: - BUG(); - } - } - - xfs_icsb_unlock_all_counters(mp); -} - -STATIC void -xfs_icsb_enable_counter( - xfs_mount_t *mp, - xfs_sb_field_t field, - uint64_t count, - uint64_t resid) -{ - xfs_icsb_cnts_t *cntp; - int i; - - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - - xfs_icsb_lock_all_counters(mp); - for_each_online_cpu(i) { - cntp = per_cpu_ptr(mp->m_sb_cnts, i); - switch (field) { - case XFS_SBS_ICOUNT: - cntp->icsb_icount = count + resid; - break; - case XFS_SBS_IFREE: - cntp->icsb_ifree = count + resid; - break; - case XFS_SBS_FDBLOCKS: - cntp->icsb_fdblocks = count + resid; - break; - default: - BUG(); - break; - } - resid = 0; - } - clear_bit(field, &mp->m_icsb_counters); - xfs_icsb_unlock_all_counters(mp); -} - -void -xfs_icsb_sync_counters_locked( - xfs_mount_t *mp, - int flags) -{ - xfs_icsb_cnts_t cnt; - - xfs_icsb_count(mp, &cnt, flags); - - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT)) - mp->m_sb.sb_icount = cnt.icsb_icount; - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE)) - mp->m_sb.sb_ifree = cnt.icsb_ifree; - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS)) - mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; -} - -/* - * Accurate update of per-cpu counters to incore superblock - */ -void -xfs_icsb_sync_counters( - xfs_mount_t *mp, - int flags) -{ - spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_locked(mp, flags); - spin_unlock(&mp->m_sb_lock); -} - -/* - * Balance and enable/disable counters as necessary. - * - * Thresholds for re-enabling counters are somewhat magic. inode counts are - * chosen to be the same number as single on disk allocation chunk per CPU, and - * free blocks is something far enough zero that we aren't going thrash when we - * get near ENOSPC. We also need to supply a minimum we require per cpu to - * prevent looping endlessly when xfs_alloc_space asks for more than will - * be distributed to a single CPU but each CPU has enough blocks to be - * reenabled. - * - * Note that we can be called when counters are already disabled. - * xfs_icsb_disable_counter() optimises the counter locking in this case to - * prevent locking every per-cpu counter needlessly. - */ - -#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64 -#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ - (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) -STATIC void -xfs_icsb_balance_counter_locked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int min_per_cpu) -{ - uint64_t count, resid; - int weight = num_online_cpus(); - uint64_t min = (uint64_t)min_per_cpu; - - /* disable counter and sync counter */ - xfs_icsb_disable_counter(mp, field); - - /* update counters - first CPU gets residual*/ - switch (field) { - case XFS_SBS_ICOUNT: - count = mp->m_sb.sb_icount; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - return; - break; - case XFS_SBS_IFREE: - count = mp->m_sb.sb_ifree; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - return; - break; - case XFS_SBS_FDBLOCKS: - count = mp->m_sb.sb_fdblocks; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) - return; - break; - default: - BUG(); - count = resid = 0; /* quiet, gcc */ - break; - } - - xfs_icsb_enable_counter(mp, field, count, resid); -} - -STATIC void -xfs_icsb_balance_counter( - xfs_mount_t *mp, - xfs_sb_field_t fields, - int min_per_cpu) -{ - spin_lock(&mp->m_sb_lock); - xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); - spin_unlock(&mp->m_sb_lock); -} - -int -xfs_icsb_modify_counters( - xfs_mount_t *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) -{ - xfs_icsb_cnts_t *icsbp; - long long lcounter; /* long counter for 64 bit fields */ - int ret = 0; - - might_sleep(); -again: - preempt_disable(); - icsbp = this_cpu_ptr(mp->m_sb_cnts); - - /* - * if the counter is disabled, go to slow path - */ - if (unlikely(xfs_icsb_counter_disabled(mp, field))) - goto slow_path; - xfs_icsb_lock_cntr(icsbp); - if (unlikely(xfs_icsb_counter_disabled(mp, field))) { - xfs_icsb_unlock_cntr(icsbp); - goto slow_path; - } - - switch (field) { - case XFS_SBS_ICOUNT: - lcounter = icsbp->icsb_icount; - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_icount = lcounter; - break; - - case XFS_SBS_IFREE: - lcounter = icsbp->icsb_ifree; - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_ifree = lcounter; - break; - - case XFS_SBS_FDBLOCKS: - BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0); - - lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); - break; - default: - BUG(); - break; - } - xfs_icsb_unlock_cntr(icsbp); - preempt_enable(); - return 0; - -slow_path: - preempt_enable(); - - /* - * serialise with a mutex so we don't burn lots of cpu on - * the superblock lock. We still need to hold the superblock - * lock, however, when we modify the global structures. - */ - xfs_icsb_lock(mp); - - /* - * Now running atomically. - * - * If the counter is enabled, someone has beaten us to rebalancing. - * Drop the lock and try again in the fast path.... - */ - if (!(xfs_icsb_counter_disabled(mp, field))) { - xfs_icsb_unlock(mp); - goto again; - } - - /* - * The counter is currently disabled. Because we are - * running atomically here, we know a rebalance cannot - * be in progress. Hence we can go straight to operating - * on the global superblock. We do not call xfs_mod_incore_sb() - * here even though we need to get the m_sb_lock. Doing so - * will cause us to re-enter this function and deadlock. - * Hence we get the m_sb_lock ourselves and then call - * xfs_mod_incore_sb_unlocked() as the unlocked path operates - * directly on the global counters. - */ - spin_lock(&mp->m_sb_lock); - ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - spin_unlock(&mp->m_sb_lock); - - /* - * Now that we've modified the global superblock, we - * may be able to re-enable the distributed counters - * (e.g. lots of space just got freed). After that - * we are done. - */ - if (ret != -ENOSPC) - xfs_icsb_balance_counter(mp, field, 0); - xfs_icsb_unlock(mp); - return ret; - -balance_counter: - xfs_icsb_unlock_cntr(icsbp); - preempt_enable(); - - /* - * We may have multiple threads here if multiple per-cpu - * counters run dry at the same time. This will mean we can - * do more balances than strictly necessary but it is not - * the common slowpath case. - */ - xfs_icsb_lock(mp); - - /* - * running atomically. - * - * This will leave the counter in the correct state for future - * accesses. After the rebalance, we simply try again and our retry - * will either succeed through the fast path or slow path without - * another balance operation being required. - */ - xfs_icsb_balance_counter(mp, field, delta); - xfs_icsb_unlock(mp); - goto again; -} - -#endif diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 0d8abd6364d9..8c995a2ccb6f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -18,8 +18,6 @@ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ -#ifdef __KERNEL__ - struct xlog; struct xfs_inode; struct xfs_mru_cache; @@ -29,44 +27,6 @@ struct xfs_quotainfo; struct xfs_dir_ops; struct xfs_da_geometry; -#ifdef HAVE_PERCPU_SB - -/* - * Valid per-cpu incore superblock counters. Note that if you add new counters, - * you may need to define new counter disabled bit field descriptors as there - * are more possible fields in the superblock that can fit in a bitfield on a - * 32 bit platform. The XFS_SBS_* values for the current current counters just - * fit. - */ -typedef struct xfs_icsb_cnts { - uint64_t icsb_fdblocks; - uint64_t icsb_ifree; - uint64_t icsb_icount; - unsigned long icsb_flags; -} xfs_icsb_cnts_t; - -#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */ - -#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ - -extern int xfs_icsb_init_counters(struct xfs_mount *); -extern void xfs_icsb_reinit_counters(struct xfs_mount *); -extern void xfs_icsb_destroy_counters(struct xfs_mount *); -extern void xfs_icsb_sync_counters(struct xfs_mount *, int); -extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); -extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, - int64_t, int); - -#else -#define xfs_icsb_init_counters(mp) (0) -#define xfs_icsb_destroy_counters(mp) do { } while (0) -#define xfs_icsb_reinit_counters(mp) do { } while (0) -#define xfs_icsb_sync_counters(mp, flags) do { } while (0) -#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) -#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \ - xfs_mod_incore_sb(mp, field, delta, rsvd) -#endif - /* dynamic preallocation free space thresholds, 5% down to 1% */ enum { XFS_LOWSP_1_PCNT = 0, @@ -81,8 +41,13 @@ typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ struct xfs_ail *m_ail; /* fs active log item list */ - xfs_sb_t m_sb; /* copy of fs superblock */ + + struct xfs_sb m_sb; /* copy of fs superblock */ spinlock_t m_sb_lock; /* sb counter lock */ + struct percpu_counter m_icount; /* allocated inodes counter */ + struct percpu_counter m_ifree; /* free inodes counter */ + struct percpu_counter m_fdblocks; /* free block counter */ + struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_fsname; /* filesystem name */ int m_fsname_len; /* strlen of fs name */ @@ -152,12 +117,6 @@ typedef struct xfs_mount { const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ uint m_chsize; /* size of next field */ atomic_t m_active_trans; /* number trans frozen */ -#ifdef HAVE_PERCPU_SB - xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ - unsigned long m_icsb_counters; /* disabled per-cpu counters */ - struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ - struct mutex m_icsb_mutex; /* balancer sync lock */ -#endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks @@ -300,35 +259,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } -/* - * Per-cpu superblock locking functions - */ -#ifdef HAVE_PERCPU_SB -static inline void -xfs_icsb_lock(xfs_mount_t *mp) -{ - mutex_lock(&mp->m_icsb_mutex); -} - -static inline void -xfs_icsb_unlock(xfs_mount_t *mp) -{ - mutex_unlock(&mp->m_icsb_mutex); -} -#else -#define xfs_icsb_lock(mp) -#define xfs_icsb_unlock(mp) -#endif - -/* - * This structure is for use by the xfs_mod_incore_sb_batch() routine. - * xfs_growfs can specify a few fields which are more than int limit - */ -typedef struct xfs_mod_sb { - xfs_sb_field_t msb_field; /* Field to modify, see below */ - int64_t msb_delta; /* Change to make to specified field */ -} xfs_mod_sb_t; - /* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. @@ -383,11 +313,14 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); - extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); -extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, - uint, int); + +extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, + bool reserved); +extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); + extern int xfs_mount_log_sb(xfs_mount_t *); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); @@ -399,6 +332,4 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern void xfs_set_low_space_thresholds(struct xfs_mount *); -#endif /* __KERNEL__ */ - #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 30ecca3037e3..f8a674d7f092 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -437,7 +437,7 @@ xfs_mru_cache_insert( if (!mru || !mru->lists) return -EINVAL; - if (radix_tree_preload(GFP_KERNEL)) + if (radix_tree_preload(GFP_NOFS)) return -ENOMEM; INIT_LIST_HEAD(&elem->list_node); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 365dd57ea760..981a657eca39 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -31,7 +31,8 @@ int xfs_break_layouts( struct inode *inode, - uint *iolock) + uint *iolock, + bool with_imutex) { struct xfs_inode *ip = XFS_I(inode); int error; @@ -40,8 +41,12 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); + if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) + mutex_unlock(&inode->i_mutex); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; + if (with_imutex) + mutex_lock(&inode->i_mutex); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index b7fbfce660f6..8147ac108820 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -8,9 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock); +int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); #else -static inline int xfs_break_layouts(struct inode *inode, uint *iolock) +static inline int +xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) { return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index fbbb9e62e274..5538468c7f63 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -719,6 +719,7 @@ xfs_qm_qino_alloc( xfs_trans_t *tp; int error; int committed; + bool need_alloc = true; *ip = NULL; /* @@ -747,6 +748,7 @@ xfs_qm_qino_alloc( return error; mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; + need_alloc = false; } } @@ -758,7 +760,7 @@ xfs_qm_qino_alloc( return error; } - if (!*ip) { + if (need_alloc) { error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); if (error) { @@ -794,11 +796,14 @@ xfs_qm_qino_alloc( spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); - return error; } - return 0; + if (need_alloc) + xfs_finish_inode_setup(*ip); + return error; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8fcc4ccc5c79..5f357ca97e76 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -109,8 +109,6 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ -#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ @@ -361,28 +359,10 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_GQUOTA_ENFD; - } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { - xfs_warn(mp, - "delaylog is the default now, option is deprecated."); - } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { - xfs_warn(mp, - "nodelaylog support has been removed, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_DISCARD)) { mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { mp->m_flags &= ~XFS_MOUNT_DISCARD; - } else if (!strcmp(this_char, "ihashsize")) { - xfs_warn(mp, - "ihashsize no longer used, option is deprecated."); - } else if (!strcmp(this_char, "osyncisdsync")) { - xfs_warn(mp, - "osyncisdsync has no effect, option is deprecated."); - } else if (!strcmp(this_char, "osyncisosync")) { - xfs_warn(mp, - "osyncisosync has no effect, option is deprecated."); - } else if (!strcmp(this_char, "irixsgid")) { - xfs_warn(mp, - "irixsgid is now a sysctl(2) variable, option is deprecated."); } else { xfs_warn(mp, "unknown mount option [%s].", this_char); return -EINVAL; @@ -986,6 +966,8 @@ xfs_fs_inode_init_once( atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); + mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); } @@ -1033,23 +1015,6 @@ xfs_free_fsname( kfree(mp->m_logname); } -STATIC void -xfs_fs_put_super( - struct super_block *sb) -{ - struct xfs_mount *mp = XFS_M(sb); - - xfs_filestream_unmount(mp); - xfs_unmountfs(mp); - - xfs_freesb(mp); - xfs_icsb_destroy_counters(mp); - xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); - xfs_free_fsname(mp); - kfree(mp); -} - STATIC int xfs_fs_sync_fs( struct super_block *sb, @@ -1085,6 +1050,9 @@ xfs_fs_statfs( xfs_sb_t *sbp = &mp->m_sb; struct xfs_inode *ip = XFS_I(dentry->d_inode); __uint64_t fakeinos, id; + __uint64_t icount; + __uint64_t ifree; + __uint64_t fdblocks; xfs_extlen_t lsize; __int64_t ffree; @@ -1095,17 +1063,21 @@ xfs_fs_statfs( statp->f_fsid.val[0] = (u32)id; statp->f_fsid.val[1] = (u32)(id >> 32); - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + icount = percpu_counter_sum(&mp->m_icount); + ifree = percpu_counter_sum(&mp->m_ifree); + fdblocks = percpu_counter_sum(&mp->m_fdblocks); spin_lock(&mp->m_sb_lock); statp->f_bsize = sbp->sb_blocksize; lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; statp->f_blocks = sbp->sb_dblocks - lsize; - statp->f_bfree = statp->f_bavail = - sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + spin_unlock(&mp->m_sb_lock); + + statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp); + statp->f_bavail = statp->f_bfree; + fakeinos = statp->f_bfree << sbp->sb_inopblog; - statp->f_files = - MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); if (mp->m_maxicount) statp->f_files = min_t(typeof(statp->f_files), statp->f_files, @@ -1117,10 +1089,9 @@ xfs_fs_statfs( sbp->sb_icount); /* make sure statp->f_ffree does not underflow */ - ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + ffree = statp->f_files - (icount - ifree); statp->f_ffree = max_t(__int64_t, ffree, 0); - spin_unlock(&mp->m_sb_lock); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == @@ -1256,6 +1227,12 @@ xfs_fs_remount( /* ro -> rw */ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + xfs_warn(mp, + "ro->rw transition prohibited on norecovery mount"); + return -EINVAL; + } + mp->m_flags &= ~XFS_MOUNT_RDONLY; /* @@ -1401,6 +1378,51 @@ xfs_finish_flags( return 0; } +static int +xfs_init_percpu_counters( + struct xfs_mount *mp) +{ + int error; + + error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); + if (error) + return -ENOMEM; + + error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL); + if (error) + goto free_icount; + + error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); + if (error) + goto free_ifree; + + return 0; + +free_ifree: + percpu_counter_destroy(&mp->m_ifree); +free_icount: + percpu_counter_destroy(&mp->m_icount); + return -ENOMEM; +} + +void +xfs_reinit_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); + percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); + percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); +} + +static void +xfs_destroy_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_destroy(&mp->m_icount); + percpu_counter_destroy(&mp->m_ifree); + percpu_counter_destroy(&mp->m_fdblocks); +} + STATIC int xfs_fs_fill_super( struct super_block *sb, @@ -1449,7 +1471,7 @@ xfs_fs_fill_super( if (error) goto out_close_devices; - error = xfs_icsb_init_counters(mp); + error = xfs_init_percpu_counters(mp); if (error) goto out_destroy_workqueues; @@ -1507,7 +1529,7 @@ xfs_fs_fill_super( out_free_sb: xfs_freesb(mp); out_destroy_counters: - xfs_icsb_destroy_counters(mp); + xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); out_close_devices: @@ -1524,6 +1546,24 @@ xfs_fs_fill_super( goto out_free_sb; } +STATIC void +xfs_fs_put_super( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_notice(mp, "Unmounting Filesystem"); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + + xfs_freesb(mp); + xfs_destroy_percpu_counters(mp); + xfs_destroy_mount_workqueues(mp); + xfs_close_devices(mp); + xfs_free_fsname(mp); + kfree(mp); +} + STATIC struct dentry * xfs_fs_mount( struct file_system_type *fs_type, diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 2b830c2f322e..499058fea303 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -72,6 +72,8 @@ extern const struct export_operations xfs_export_operations; extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; +extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); + #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) #endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 25791df6f638..3df411eadb86 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -177,7 +177,7 @@ xfs_symlink( int pathlen; struct xfs_bmap_free free_list; xfs_fsblock_t first_block; - bool unlock_dp_on_error = false; + bool unlock_dp_on_error = false; uint cancel_flags; int committed; xfs_fileoff_t first_fsb; @@ -221,7 +221,7 @@ xfs_symlink( XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) - goto std_return; + return error; tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; @@ -241,7 +241,7 @@ xfs_symlink( } if (error) { cancel_flags = 0; - goto error_return; + goto out_trans_cancel; } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); @@ -252,7 +252,7 @@ xfs_symlink( */ if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { error = -EPERM; - goto error_return; + goto out_trans_cancel; } /* @@ -261,7 +261,7 @@ xfs_symlink( error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); if (error) - goto error_return; + goto out_trans_cancel; /* * Check for ability to enter directory entry, if no space reserved. @@ -269,7 +269,7 @@ xfs_symlink( if (!resblks) { error = xfs_dir_canenter(tp, dp, link_name); if (error) - goto error_return; + goto out_trans_cancel; } /* * Initialize the bmap freelist prior to calling either @@ -282,15 +282,14 @@ xfs_symlink( */ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, resblks > 0, &ip, NULL); - if (error) { - if (error == -ENOSPC) - goto error_return; - goto error1; - } + if (error) + goto out_trans_cancel; /* - * An error after we've joined dp to the transaction will result in the - * transaction cancel unlocking dp so don't do it explicitly in the + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); @@ -330,7 +329,7 @@ xfs_symlink( XFS_BMAPI_METADATA, &first_block, resblks, mval, &nmaps, &free_list); if (error) - goto error2; + goto out_bmap_cancel; if (resblks) resblks -= fs_blocks; @@ -348,7 +347,7 @@ xfs_symlink( BTOBB(byte_cnt), 0); if (!bp) { error = -ENOMEM; - goto error2; + goto out_bmap_cancel; } bp->b_ops = &xfs_symlink_buf_ops; @@ -378,7 +377,7 @@ xfs_symlink( error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, &first_block, &free_list, resblks); if (error) - goto error2; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); @@ -392,10 +391,13 @@ xfs_symlink( } error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - goto error2; - } + if (error) + goto out_bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); @@ -403,20 +405,28 @@ xfs_symlink( *ipp = ip; return 0; - error2: - IRELE(ip); - error1: +out_bmap_cancel: xfs_bmap_cancel(&free_list); cancel_flags |= XFS_TRANS_ABORT; - error_return: +out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); +out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); - std_return: return error; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 51372e34d988..615781bf4ee5 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -115,7 +115,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class, __entry->refcount = refcount; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u refcount %d caller %pf", + TP_printk("dev %d:%d agno %u refcount %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->refcount, @@ -239,7 +239,7 @@ TRACE_EVENT(xfs_iext_insert, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %lld count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -283,7 +283,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %lld count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -329,7 +329,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " - "lock %d flags %s caller %pf", + "lock %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->nblks, @@ -402,7 +402,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d flags %s caller %pf", + "lock %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -447,7 +447,7 @@ TRACE_EVENT(xfs_buf_ioerror, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d error %d flags %s caller %pf", + "lock %d error %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -613,7 +613,7 @@ DECLARE_EVENT_CLASS(xfs_lock_class, __entry->lock_flags = lock_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", + TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), @@ -664,6 +664,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); +DEFINE_INODE_EVENT(xfs_insert_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL DEFINE_INODE_EVENT(xfs_get_acl); @@ -685,6 +686,9 @@ DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); +DEFINE_INODE_EVENT(xfs_filemap_fault); +DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), @@ -702,7 +706,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, __entry->pincount = atomic_read(&ip->i_pincount); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, @@ -1217,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found); DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -1333,7 +1342,7 @@ TRACE_EVENT(xfs_bunmap, __entry->flags = flags; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" - "flags %s caller %pf", + "flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1466,7 +1475,7 @@ TRACE_EVENT(xfs_agf, ), TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " "levels b %u c %u flfirst %u fllast %u flcount %u " - "freeblks %u longest %u caller %pf", + "freeblks %u longest %u caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index eb90cd59a0ec..220ef2c906b2 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -173,7 +173,7 @@ xfs_trans_reserve( uint rtextents) { int error = 0; - int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); @@ -184,8 +184,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (blocks > 0) { - error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, - -((int64_t)blocks), rsvd); + error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); return -ENOSPC; @@ -236,8 +235,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (rtextents > 0) { - error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, - -((int64_t)rtextents), rsvd); + error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents)); if (error) { error = -ENOSPC; goto undo_log; @@ -268,8 +266,7 @@ xfs_trans_reserve( undo_blocks: if (blocks > 0) { - xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, - (int64_t)blocks, rsvd); + xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); tp->t_blk_res = 0; } @@ -488,6 +485,54 @@ xfs_trans_apply_sb_deltas( sizeof(sbp->sb_frextents) - 1); } +STATIC int +xfs_sb_mod8( + uint8_t *field, + int8_t delta) +{ + int8_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + +STATIC int +xfs_sb_mod32( + uint32_t *field, + int32_t delta) +{ + int32_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + +STATIC int +xfs_sb_mod64( + uint64_t *field, + int64_t delta) +{ + int64_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + /* * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations * and apply superblock counter changes to the in-core superblock. The @@ -495,13 +540,6 @@ xfs_trans_apply_sb_deltas( * applied to the in-core superblock. The idea is that that has already been * done. * - * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). - * However, we have to ensure that we only modify each superblock field only - * once because the application of the delta values may not be atomic. That can - * lead to ENOSPC races occurring if we have two separate modifcations of the - * free space counter to put back the entire reservation and then take away - * what we used. - * * If we are not logging superblock counters, then the inode allocated/free and * used block counts are not updated in the on disk superblock. In this case, * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we @@ -509,21 +547,15 @@ xfs_trans_apply_sb_deltas( */ void xfs_trans_unreserve_and_mod_sb( - xfs_trans_t *tp) + struct xfs_trans *tp) { - xfs_mod_sb_t msb[9]; /* If you add cases, add entries */ - xfs_mod_sb_t *msbp; - xfs_mount_t *mp = tp->t_mountp; - /* REFERENCED */ - int error; - int rsvd; - int64_t blkdelta = 0; - int64_t rtxdelta = 0; - int64_t idelta = 0; - int64_t ifreedelta = 0; - - msbp = msb; - rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + struct xfs_mount *mp = tp->t_mountp; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + int64_t blkdelta = 0; + int64_t rtxdelta = 0; + int64_t idelta = 0; + int64_t ifreedelta = 0; + int error; /* calculate deltas */ if (tp->t_blk_res > 0) @@ -547,97 +579,115 @@ xfs_trans_unreserve_and_mod_sb( /* apply the per-cpu counters */ if (blkdelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - blkdelta, rsvd); + error = xfs_mod_fdblocks(mp, blkdelta, rsvd); if (error) goto out; } if (idelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, - idelta, rsvd); + error = xfs_mod_icount(mp, idelta); if (error) goto out_undo_fdblocks; } if (ifreedelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, - ifreedelta, rsvd); + error = xfs_mod_ifree(mp, ifreedelta); if (error) goto out_undo_icount; } + if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) + return; + /* apply remaining deltas */ - if (rtxdelta != 0) { - msbp->msb_field = XFS_SBS_FREXTENTS; - msbp->msb_delta = rtxdelta; - msbp++; - } - - if (tp->t_flags & XFS_TRANS_SB_DIRTY) { - if (tp->t_dblocks_delta != 0) { - msbp->msb_field = XFS_SBS_DBLOCKS; - msbp->msb_delta = tp->t_dblocks_delta; - msbp++; - } - if (tp->t_agcount_delta != 0) { - msbp->msb_field = XFS_SBS_AGCOUNT; - msbp->msb_delta = tp->t_agcount_delta; - msbp++; - } - if (tp->t_imaxpct_delta != 0) { - msbp->msb_field = XFS_SBS_IMAX_PCT; - msbp->msb_delta = tp->t_imaxpct_delta; - msbp++; - } - if (tp->t_rextsize_delta != 0) { - msbp->msb_field = XFS_SBS_REXTSIZE; - msbp->msb_delta = tp->t_rextsize_delta; - msbp++; - } - if (tp->t_rbmblocks_delta != 0) { - msbp->msb_field = XFS_SBS_RBMBLOCKS; - msbp->msb_delta = tp->t_rbmblocks_delta; - msbp++; - } - if (tp->t_rblocks_delta != 0) { - msbp->msb_field = XFS_SBS_RBLOCKS; - msbp->msb_delta = tp->t_rblocks_delta; - msbp++; - } - if (tp->t_rextents_delta != 0) { - msbp->msb_field = XFS_SBS_REXTENTS; - msbp->msb_delta = tp->t_rextents_delta; - msbp++; - } - if (tp->t_rextslog_delta != 0) { - msbp->msb_field = XFS_SBS_REXTSLOG; - msbp->msb_delta = tp->t_rextslog_delta; - msbp++; - } - } - - /* - * If we need to change anything, do it. - */ - if (msbp > msb) { - error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, - (uint)(msbp - msb), rsvd); + spin_lock(&mp->m_sb_lock); + if (rtxdelta) { + error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta); if (error) - goto out_undo_ifreecount; + goto out_undo_ifree; } + if (tp->t_dblocks_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta); + if (error) + goto out_undo_frextents; + } + if (tp->t_agcount_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta); + if (error) + goto out_undo_dblocks; + } + if (tp->t_imaxpct_delta != 0) { + error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta); + if (error) + goto out_undo_agcount; + } + if (tp->t_rextsize_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_rextsize, + tp->t_rextsize_delta); + if (error) + goto out_undo_imaxpct; + } + if (tp->t_rbmblocks_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, + tp->t_rbmblocks_delta); + if (error) + goto out_undo_rextsize; + } + if (tp->t_rblocks_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta); + if (error) + goto out_undo_rbmblocks; + } + if (tp->t_rextents_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_rextents, + tp->t_rextents_delta); + if (error) + goto out_undo_rblocks; + } + if (tp->t_rextslog_delta != 0) { + error = xfs_sb_mod8(&mp->m_sb.sb_rextslog, + tp->t_rextslog_delta); + if (error) + goto out_undo_rextents; + } + spin_unlock(&mp->m_sb_lock); return; -out_undo_ifreecount: +out_undo_rextents: + if (tp->t_rextents_delta) + xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta); +out_undo_rblocks: + if (tp->t_rblocks_delta) + xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta); +out_undo_rbmblocks: + if (tp->t_rbmblocks_delta) + xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta); +out_undo_rextsize: + if (tp->t_rextsize_delta) + xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta); +out_undo_imaxpct: + if (tp->t_rextsize_delta) + xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta); +out_undo_agcount: + if (tp->t_agcount_delta) + xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta); +out_undo_dblocks: + if (tp->t_dblocks_delta) + xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta); +out_undo_frextents: + if (rtxdelta) + xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta); +out_undo_ifree: + spin_unlock(&mp->m_sb_lock); if (ifreedelta) - xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd); + xfs_mod_ifree(mp, -ifreedelta); out_undo_icount: if (idelta) - xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd); + xfs_mod_icount(mp, -idelta); out_undo_fdblocks: if (blkdelta) - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); + xfs_mod_fdblocks(mp, -blkdelta, rsvd); out: ASSERT(error == 0); return; diff --git a/include/linux/falloc.h b/include/linux/falloc.h index 31591686ac2d..996111000a8c 100644 --- a/include/linux/falloc.h +++ b/include/linux/falloc.h @@ -21,4 +21,10 @@ struct space_resv { #define FS_IOC_RESVSP _IOW('X', 40, struct space_resv) #define FS_IOC_RESVSP64 _IOW('X', 42, struct space_resv) +#define FALLOC_FL_SUPPORTED_MASK (FALLOC_FL_KEEP_SIZE | \ + FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_COLLAPSE_RANGE | \ + FALLOC_FL_ZERO_RANGE | \ + FALLOC_FL_INSERT_RANGE) + #endif /* _FALLOC_H_ */ diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h index d1197ae3723c..3e445a760f14 100644 --- a/include/uapi/linux/falloc.h +++ b/include/uapi/linux/falloc.h @@ -41,4 +41,21 @@ */ #define FALLOC_FL_ZERO_RANGE 0x10 +/* + * FALLOC_FL_INSERT_RANGE is use to insert space within the file size without + * overwriting any existing data. The contents of the file beyond offset are + * shifted towards right by len bytes to create a hole. As such, this + * operation will increase the size of the file by len bytes. + * + * Different filesystems may implement different limitations on the granularity + * of the operation. Most will limit operations to filesystem block size + * boundaries, but this boundary may be larger or smaller depending on + * the filesystem and/or the configuration of the filesystem or file. + * + * Attempting to insert space using this flag at OR beyond the end of + * the file is considered an illegal operation - just use ftruncate(2) or + * fallocate(2) with mode 0 for such type of operations. + */ +#define FALLOC_FL_INSERT_RANGE 0x20 + #endif /* _UAPI_FALLOC_H_ */