diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 66a36befc5c0..844e288b9576 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -65,12 +65,31 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) void * kmem_zalloc_large(size_t size, xfs_km_flags_t flags) { + unsigned noio_flag = 0; void *ptr; + gfp_t lflags; ptr = kmem_zalloc(size, flags | KM_MAYFAIL); if (ptr) return ptr; - return vzalloc(size); + + /* + * __vmalloc() will allocate data pages and auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context + * here. Hence we need to tell memory reclaim that we are in such a + * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering + * the filesystem here and potentially deadlocking. + */ + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + noio_flag = memalloc_noio_save(); + + lflags = kmem_flags_convert(flags); + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + memalloc_noio_restore(noio_flag); + + return ptr; } void diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index ef62c6b6130a..98016b39b8af 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -632,38 +632,46 @@ xfs_map_at_offset( } /* - * Test if a given page is suitable for writing as part of an unwritten - * or delayed allocate extent. + * Test if a given page contains at least one buffer of a given @type. + * If @check_all_buffers is true, then we walk all the buffers in the page to + * try to find one of the type passed in. If it is not set, then the caller only + * needs to check the first buffer on the page for a match. */ -STATIC int +STATIC bool xfs_check_page_type( struct page *page, - unsigned int type) + unsigned int type, + bool check_all_buffers) { + struct buffer_head *bh; + struct buffer_head *head; + if (PageWriteback(page)) - return 0; + return false; + if (!page->mapping) + return false; + if (!page_has_buffers(page)) + return false; - if (page->mapping && page_has_buffers(page)) { - struct buffer_head *bh, *head; - int acceptable = 0; + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) { + if (type == XFS_IO_UNWRITTEN) + return true; + } else if (buffer_delay(bh)) { + if (type == XFS_IO_DELALLOC); + return true; + } else if (buffer_dirty(bh) && buffer_mapped(bh)) { + if (type == XFS_IO_OVERWRITE); + return true; + } - bh = head = page_buffers(page); - do { - if (buffer_unwritten(bh)) - acceptable += (type == XFS_IO_UNWRITTEN); - else if (buffer_delay(bh)) - acceptable += (type == XFS_IO_DELALLOC); - else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable += (type == XFS_IO_OVERWRITE); - else - break; - } while ((bh = bh->b_this_page) != head); + /* If we are only checking the first buffer, we are done now. */ + if (!check_all_buffers) + break; + } while ((bh = bh->b_this_page) != head); - if (acceptable) - return 1; - } - - return 0; + return false; } /* @@ -697,7 +705,7 @@ xfs_convert_page( goto fail_unlock_page; if (page->mapping != inode->i_mapping) goto fail_unlock_page; - if (!xfs_check_page_type(page, (*ioendp)->io_type)) + if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) goto fail_unlock_page; /* @@ -742,6 +750,15 @@ xfs_convert_page( p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; page_dirty = p_offset / len; + /* + * The moment we find a buffer that doesn't match our current type + * specification or can't be written, abort the loop and start + * writeback. As per the above xfs_imap_valid() check, only + * xfs_vm_writepage() can handle partial page writeback fully - we are + * limited here to the buffers that are contiguous with the current + * ioend, and hence a buffer we can't write breaks that contiguity and + * we have to defer the rest of the IO to xfs_vm_writepage(). + */ bh = head = page_buffers(page); do { if (offset >= end_offset) @@ -750,7 +767,7 @@ xfs_convert_page( uptodate = 0; if (!(PageUptodate(page) || buffer_uptodate(bh))) { done = 1; - continue; + break; } if (buffer_unwritten(bh) || buffer_delay(bh) || @@ -762,10 +779,11 @@ xfs_convert_page( else type = XFS_IO_OVERWRITE; - if (!xfs_imap_valid(inode, imap, offset)) { - done = 1; - continue; - } + /* + * imap should always be valid because of the above + * partial page end_offset check on the imap. + */ + ASSERT(xfs_imap_valid(inode, imap, offset)); lock_buffer(bh); if (type != XFS_IO_OVERWRITE) @@ -777,6 +795,7 @@ xfs_convert_page( count++; } else { done = 1; + break; } } while (offset += len, (bh = bh->b_this_page) != head); @@ -868,7 +887,7 @@ xfs_aops_discard_page( struct buffer_head *bh, *head; loff_t offset = page_offset(page); - if (!xfs_check_page_type(page, XFS_IO_DELALLOC)) + if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9c061ef2b0d9..107f2fdfe41f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -396,7 +396,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; + unsigned noio_flag; + /* + * vm_map_ram() will allocate auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we are likely to be under + * GFP_NOFS context here. Hence we need to tell memory reclaim + * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * memory reclaim re-entering the filesystem here and + * potentially deadlocking. + */ + noio_flag = memalloc_noio_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -404,6 +414,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); + memalloc_noio_restore(noio_flag); if (!bp->b_addr) return -ENOMEM; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 5959b3b4c7c9..8f711db61a0c 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -363,6 +363,18 @@ xfs_ialloc_ag_alloc( args.minleft = args.mp->m_in_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; + + /* + * This request might have dirtied the transaction if the AG can + * satisfy the request, but the exact block was not available. + * If the allocation did fail, subsequent requests will relax + * the exact agbno requirement and increase the alignment + * instead. It is critical that the total size of the request + * (len + alignment + slop) does not increase from this point + * on, so reset minalignslop to ensure it is not included in + * subsequent requests. + */ + args.minalignslop = 0; } else args.fsbno = NULLFSBLOCK; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f96c05669a9e..993cb19e7d39 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -314,6 +314,9 @@ xfs_readsb( error = bp->b_error; if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; goto release_buf; } diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 14e58f2c96bd..5fda18919d3b 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -80,6 +80,10 @@ xfs_readlink_bmap( if (error) { xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; goto out; } byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 647b6f1d8923..b8eef0549f3f 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -275,6 +275,10 @@ xfs_trans_read_buf_map( XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } #ifdef DEBUG @@ -338,6 +342,9 @@ xfs_trans_read_buf_map( if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } } @@ -375,6 +382,10 @@ xfs_trans_read_buf_map( if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } #ifdef DEBUG diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index 8515b0449dc8..d2c8e4a6ee2a 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c @@ -81,20 +81,28 @@ xfs_calc_buf_res( * on disk. Hence we need an inode reservation function that calculates all this * correctly. So, we log: * - * - log op headers for object + * - 4 log op headers for object + * - for the ilf, the inode core and 2 forks * - inode log format object - * - the entire inode contents (core + 2 forks) - * - two bmap btree block headers + * - the inode core + * - two inode forks containing bmap btree root blocks. + * - the btree data contained by both forks will fit into the inode size, + * hence when combined with the inode core above, we have a total of the + * actual inode size. + * - the BMBT headers need to be accounted separately, as they are + * additional to the records and pointers that fit inside the inode + * forks. */ STATIC uint xfs_calc_inode_res( struct xfs_mount *mp, uint ninodes) { - return ninodes * (sizeof(struct xlog_op_header) + - sizeof(struct xfs_inode_log_format) + - mp->m_sb.sb_inodesize + - 2 * XFS_BMBT_BLOCK_LEN(mp)); + return ninodes * + (4 * sizeof(struct xlog_op_header) + + sizeof(struct xfs_inode_log_format) + + mp->m_sb.sb_inodesize + + 2 * XFS_BMBT_BLOCK_LEN(mp)); } /*