linux-stable/fs/netfs/direct_write.c

// SPDX-License-Identifier: GPL-2.0-or-later
/* Unbuffered and direct write support.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/uio.h>
#include "internal.h"

static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
{
	struct inode *inode = wreq->inode;
	unsigned long long end = wreq->start + wreq->len;

	if (!wreq->error &&
	    i_size_read(inode) < end) {
		if (wreq->netfs_ops->update_i_size)
			wreq->netfs_ops->update_i_size(inode, end);
		else
			i_size_write(inode, end);
	}
}

/*
 * Perform an unbuffered write where we may have to do an RMW operation on an
 * encrypted file.  This can also be used for direct I/O writes.
 */
static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
						  struct netfs_group *netfs_group)
{
	struct netfs_io_request *wreq;
	unsigned long long start = iocb->ki_pos;
	unsigned long long end = start + iov_iter_count(iter);
	ssize_t ret, n;
	bool async = !is_sync_kiocb(iocb);

	_enter("");

	/* We're going to need a bounce buffer if what we transmit is going to
	 * be different in some way to the source buffer, e.g. because it gets
	 * encrypted/compressed or because it needs expanding to a block size.
	 */
	// TODO

	_debug("uw %llx-%llx", start, end);

	wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
				   start, end - start,
				   iocb->ki_flags & IOCB_DIRECT ?
				   NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
	if (IS_ERR(wreq))
		return PTR_ERR(wreq);

	{
		/* If this is an async op and we're not using a bounce buffer,
		 * we have to save the source buffer as the iterator is only
		 * good until we return.  In such a case, extract an iterator
		 * to represent as much of the the output buffer as we can
		 * manage.  Note that the extraction might not be able to
		 * allocate a sufficiently large bvec array and may shorten the
		 * request.
		 */
		if (async || user_backed_iter(iter)) {
			n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
			if (n < 0) {
				ret = n;
				goto out;
			}
			wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
			wreq->direct_bv_count = n;
			wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
			wreq->len = iov_iter_count(&wreq->iter);
		} else {
			wreq->iter = *iter;
		}

		wreq->io_iter = wreq->iter;
	}

	/* Copy the data into the bounce buffer and encrypt it. */
	// TODO

	/* Dispatch the write. */
	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
	if (async)
		wreq->iocb = iocb;
	wreq->cleanup = netfs_cleanup_dio_write;
	ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
				iocb->ki_flags & IOCB_DIRECT ?
				netfs_write_trace_dio_write :
				netfs_write_trace_unbuffered_write);
	if (ret < 0) {
		_debug("begin = %zd", ret);
		goto out;
	}

	if (!async) {
		trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
		wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
			    TASK_UNINTERRUPTIBLE);

		ret = wreq->error;
		_debug("waited = %zd", ret);
		if (ret == 0) {
			ret = wreq->transferred;
			iocb->ki_pos += ret;
		}
	} else {
		ret = -EIOCBQUEUED;
	}

out:
	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
	return ret;
}

/**
 * netfs_unbuffered_write_iter - Unbuffered write to a file
 * @iocb: IO state structure
 * @from: iov_iter with data to write
 *
 * Do an unbuffered write to a file, writing the data directly to the server
 * and not lodging the data in the pagecache.
 *
 * Return:
 * * Negative error code if no data has been written at all of
 *   vfs_fsync_range() failed for a synchronous write
 * * Number of bytes written, even for truncated writes
 */
ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct netfs_inode *ictx = netfs_inode(inode);
	unsigned long long end;
	ssize_t ret;

	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));

	if (!iov_iter_count(from))
		return 0;

	trace_netfs_write_iter(iocb, from);
	netfs_stat(&netfs_n_rh_dio_write);

	ret = netfs_start_io_direct(inode);
	if (ret < 0)
		return ret;
	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;
	ret = file_remove_privs(file);
	if (ret < 0)
		goto out;
	ret = file_update_time(file);
	if (ret < 0)
		goto out;
	ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
	if (ret < 0)
		goto out;
	end = iocb->ki_pos + iov_iter_count(from);
	if (end > ictx->zero_point)
		ictx->zero_point = end;

	fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
			   FSCACHE_INVAL_DIO_WRITE);
	ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
out:
	netfs_end_io_direct(inode);
	return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter);
netfs: Implement unbuffered/DIO write support Implement support for unbuffered writes and direct I/O writes. If the write is misaligned with respect to the fscrypt block size, then RMW cycles are performed if necessary. DIO writes are a special case of unbuffered writes with extra restriction imposed, such as block size alignment requirements. Also provide a field that can tell the code to add some extra space onto the bounce buffer for use by the filesystem in the case of a content-encrypted file. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org 2022-02-21 11:38:17 +00:00			`// SPDX-License-Identifier: GPL-2.0-or-later`
			`/* Unbuffered and direct write support.`
			`*`
			`* Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.`
			`* Written by David Howells (dhowells@redhat.com)`
			`*/`

			`#include <linux/export.h>`
			`#include <linux/uio.h>`
			`#include "internal.h"`

			`static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)`
			`{`
			`struct inode *inode = wreq->inode;`
			`unsigned long long end = wreq->start + wreq->len;`

			`if (!wreq->error &&`
			`i_size_read(inode) < end) {`
			`if (wreq->netfs_ops->update_i_size)`
			`wreq->netfs_ops->update_i_size(inode, end);`
			`else`
			`i_size_write(inode, end);`
			`}`
			`}`

			`/*`
			`* Perform an unbuffered write where we may have to do an RMW operation on an`
			`* encrypted file. This can also be used for direct I/O writes.`
			`*/`
netfs: Mark netfs_unbuffered_write_iter_locked() static Mark netfs_unbuffered_write_iter_locked() static as it's only called from the file in which it is defined. Signed-off-by: David Howells <dhowells@redhat.com> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org 2024-01-05 14:57:14 +00:00			`static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,`
			`struct netfs_group *netfs_group)`
netfs: Implement unbuffered/DIO write support Implement support for unbuffered writes and direct I/O writes. If the write is misaligned with respect to the fscrypt block size, then RMW cycles are performed if necessary. DIO writes are a special case of unbuffered writes with extra restriction imposed, such as block size alignment requirements. Also provide a field that can tell the code to add some extra space onto the bounce buffer for use by the filesystem in the case of a content-encrypted file. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org 2022-02-21 11:38:17 +00:00			`{`
			`struct netfs_io_request *wreq;`
			`unsigned long long start = iocb->ki_pos;`
			`unsigned long long end = start + iov_iter_count(iter);`
			`ssize_t ret, n;`
			`bool async = !is_sync_kiocb(iocb);`

			`_enter("");`

			`/* We're going to need a bounce buffer if what we transmit is going to`
			`* be different in some way to the source buffer, e.g. because it gets`
			`* encrypted/compressed or because it needs expanding to a block size.`
			`*/`
			`// TODO`

			`_debug("uw %llx-%llx", start, end);`

			`wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,`
			`start, end - start,`
			`iocb->ki_flags & IOCB_DIRECT ?`
			`NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);`
			`if (IS_ERR(wreq))`
			`return PTR_ERR(wreq);`

			`{`
			`/* If this is an async op and we're not using a bounce buffer,`
			`* we have to save the source buffer as the iterator is only`
			`* good until we return. In such a case, extract an iterator`
			`* to represent as much of the the output buffer as we can`
			`* manage. Note that the extraction might not be able to`
			`* allocate a sufficiently large bvec array and may shorten the`
			`* request.`
			`*/`
			`if (async || user_backed_iter(iter)) {`
			`n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);`
			`if (n < 0) {`
			`ret = n;`
			`goto out;`
			`}`
			`wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;`
			`wreq->direct_bv_count = n;`
			`wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);`
			`wreq->len = iov_iter_count(&wreq->iter);`
			`} else {`
			`wreq->iter = *iter;`
			`}`

			`wreq->io_iter = wreq->iter;`
			`}`

			`/* Copy the data into the bounce buffer and encrypt it. */`
			`// TODO`

			`/* Dispatch the write. */`
			`__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);`
			`if (async)`
			`wreq->iocb = iocb;`
			`wreq->cleanup = netfs_cleanup_dio_write;`
			`ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),`
			`iocb->ki_flags & IOCB_DIRECT ?`
			`netfs_write_trace_dio_write :`
			`netfs_write_trace_unbuffered_write);`
			`if (ret < 0) {`
			`_debug("begin = %zd", ret);`
			`goto out;`
			`}`

			`if (!async) {`
			`trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);`
			`wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,`
			`TASK_UNINTERRUPTIBLE);`

			`ret = wreq->error;`
			`_debug("waited = %zd", ret);`
			`if (ret == 0) {`
			`ret = wreq->transferred;`
			`iocb->ki_pos += ret;`
			`}`
			`} else {`
			`ret = -EIOCBQUEUED;`
			`}`

			`out:`
			`netfs_put_request(wreq, false, netfs_rreq_trace_put_return);`
			`return ret;`
			`}`

			`/**`
			`* netfs_unbuffered_write_iter - Unbuffered write to a file`
			`* @iocb: IO state structure`
			`* @from: iov_iter with data to write`
			`*`
			`* Do an unbuffered write to a file, writing the data directly to the server`
			`* and not lodging the data in the pagecache.`
			`*`
			`* Return:`
			`* * Negative error code if no data has been written at all of`
			`* vfs_fsync_range() failed for a synchronous write`
			`* * Number of bytes written, even for truncated writes`
			`*/`
			`ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)`
			`{`
			`struct file *file = iocb->ki_filp;`
			`struct inode *inode = file->f_mapping->host;`
			`struct netfs_inode *ictx = netfs_inode(inode);`
netfs: Optimise away reads above the point at which there can be no data Track the file position above which the server is not expected to have any data (the "zero point") and preemptively assume that we can satisfy requests by filling them with zeroes locally rather than attempting to download them if they're over that line - even if we've written data back to the server. Assume that any data that was written back above that position is held in the local cache. Note that we have to split requests that straddle the line. Make use of this to optimise away some reads from the server. We need to set the zero point in the following circumstances: (1) When we see an extant remote inode and have no cache for it, we set the zero_point to i_size. (2) On local inode creation, we set zero_point to 0. (3) On local truncation down, we reduce zero_point to the new i_size if the new i_size is lower. (4) On local truncation up, we don't change zero_point. (5) On local modification, we don't change zero_point. (6) On remote invalidation, we set zero_point to the new i_size. (7) If stored data is discarded from the pagecache or culled from fscache, we must set zero_point above that if the data also got written to the server. (8) If dirty data is written back to the server, but not fscache, we must set zero_point above that. (9) If a direct I/O write is made, set zero_point above that. Assuming the above, any read from the server at or above the zero_point position will return all zeroes. The zero_point value can be stored in the cache, provided the above rules are applied to it by any code that culls part of the local cache. Signed-off-by: David Howells <dhowells@redhat.com> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org 2023-11-24 13:39:02 +00:00			`unsigned long long end;`
netfs: Implement unbuffered/DIO write support Implement support for unbuffered writes and direct I/O writes. If the write is misaligned with respect to the fscrypt block size, then RMW cycles are performed if necessary. DIO writes are a special case of unbuffered writes with extra restriction imposed, such as block size alignment requirements. Also provide a field that can tell the code to add some extra space onto the bounce buffer for use by the filesystem in the case of a content-encrypted file. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org 2022-02-21 11:38:17 +00:00			`ssize_t ret;`

			`_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));`

netfs: Fix missing zero-length check in unbuffered write Fix netfs_unbuffered_write_iter() to return immediately if generic_write_checks() returns 0, indicating there's nothing to write. Note that netfs_file_write_iter() already does this. Also, whilst we're at it, put in checks for the size being zero before we even take the locks. Note that generic_write_checks() can still reduce the size to zero, so we still need that check. Without this, a warning similar to the following is logged to dmesg: netfs: Zero-sized write [R=1b6da] and the syscall fails with EIO, e.g.: /sbin/ldconfig.real: Writing of cache extension data failed: Input/output error This can be reproduced on 9p by: xfs_io -f -c 'pwrite 0 0' /xfstest.test/foo Fixes: 153a9961b551 ("netfs: Implement unbuffered/DIO write support") Reported-by: Eric Van Hensbergen <ericvh@kernel.org> Link: https://lore.kernel.org/r/ZbQUU6QKmIftKsmo@FV7GG9FTHL/ Signed-off-by: David Howells <dhowells@redhat.com> Link: https://lore.kernel.org/r/20240129094924.1221977-3-dhowells@redhat.com Tested-by: Dominique Martinet <asmadeus@codewreck.org> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Jeff Layton <jlayton@kernel.org> cc: <v9fs@lists.linux.dev> cc: <linux_oss@crudebyte.com> cc: <netfs@lists.linux.dev> cc: <linux-fsdevel@vger.kernel.org> Signed-off-by: Christian Brauner <brauner@kernel.org> 2024-01-29 09:49:19 +00:00			`if (!iov_iter_count(from))`
			`return 0;`

netfs: Implement unbuffered/DIO write support Implement support for unbuffered writes and direct I/O writes. If the write is misaligned with respect to the fscrypt block size, then RMW cycles are performed if necessary. DIO writes are a special case of unbuffered writes with extra restriction imposed, such as block size alignment requirements. Also provide a field that can tell the code to add some extra space onto the bounce buffer for use by the filesystem in the case of a content-encrypted file. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org 2022-02-21 11:38:17 +00:00			`trace_netfs_write_iter(iocb, from);`
netfs: Count DIO writes Provide a counter for DIO writes to match that for DIO reads. Signed-off-by: David Howells <dhowells@redhat.com> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org 2024-01-05 14:55:52 +00:00			`netfs_stat(&netfs_n_rh_dio_write);`
netfs: Implement unbuffered/DIO write support Implement support for unbuffered writes and direct I/O writes. If the write is misaligned with respect to the fscrypt block size, then RMW cycles are performed if necessary. DIO writes are a special case of unbuffered writes with extra restriction imposed, such as block size alignment requirements. Also provide a field that can tell the code to add some extra space onto the bounce buffer for use by the filesystem in the case of a content-encrypted file. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org 2022-02-21 11:38:17 +00:00
			`ret = netfs_start_io_direct(inode);`
			`if (ret < 0)`
			`return ret;`
			`ret = generic_write_checks(iocb, from);`
netfs: Fix missing zero-length check in unbuffered write Fix netfs_unbuffered_write_iter() to return immediately if generic_write_checks() returns 0, indicating there's nothing to write. Note that netfs_file_write_iter() already does this. Also, whilst we're at it, put in checks for the size being zero before we even take the locks. Note that generic_write_checks() can still reduce the size to zero, so we still need that check. Without this, a warning similar to the following is logged to dmesg: netfs: Zero-sized write [R=1b6da] and the syscall fails with EIO, e.g.: /sbin/ldconfig.real: Writing of cache extension data failed: Input/output error This can be reproduced on 9p by: xfs_io -f -c 'pwrite 0 0' /xfstest.test/foo Fixes: 153a9961b551 ("netfs: Implement unbuffered/DIO write support") Reported-by: Eric Van Hensbergen <ericvh@kernel.org> Link: https://lore.kernel.org/r/ZbQUU6QKmIftKsmo@FV7GG9FTHL/ Signed-off-by: David Howells <dhowells@redhat.com> Link: https://lore.kernel.org/r/20240129094924.1221977-3-dhowells@redhat.com Tested-by: Dominique Martinet <asmadeus@codewreck.org> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Jeff Layton <jlayton@kernel.org> cc: <v9fs@lists.linux.dev> cc: <linux_oss@crudebyte.com> cc: <netfs@lists.linux.dev> cc: <linux-fsdevel@vger.kernel.org> Signed-off-by: Christian Brauner <brauner@kernel.org> 2024-01-29 09:49:19 +00:00			`if (ret <= 0)`
netfs: Implement unbuffered/DIO write support Implement support for unbuffered writes and direct I/O writes. If the write is misaligned with respect to the fscrypt block size, then RMW cycles are performed if necessary. DIO writes are a special case of unbuffered writes with extra restriction imposed, such as block size alignment requirements. Also provide a field that can tell the code to add some extra space onto the bounce buffer for use by the filesystem in the case of a content-encrypted file. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org 2022-02-21 11:38:17 +00:00			`goto out;`
			`ret = file_remove_privs(file);`
			`if (ret < 0)`
			`goto out;`
			`ret = file_update_time(file);`
			`if (ret < 0)`
			`goto out;`
			`ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));`
			`if (ret < 0)`
			`goto out;`
netfs: Optimise away reads above the point at which there can be no data Track the file position above which the server is not expected to have any data (the "zero point") and preemptively assume that we can satisfy requests by filling them with zeroes locally rather than attempting to download them if they're over that line - even if we've written data back to the server. Assume that any data that was written back above that position is held in the local cache. Note that we have to split requests that straddle the line. Make use of this to optimise away some reads from the server. We need to set the zero point in the following circumstances: (1) When we see an extant remote inode and have no cache for it, we set the zero_point to i_size. (2) On local inode creation, we set zero_point to 0. (3) On local truncation down, we reduce zero_point to the new i_size if the new i_size is lower. (4) On local truncation up, we don't change zero_point. (5) On local modification, we don't change zero_point. (6) On remote invalidation, we set zero_point to the new i_size. (7) If stored data is discarded from the pagecache or culled from fscache, we must set zero_point above that if the data also got written to the server. (8) If dirty data is written back to the server, but not fscache, we must set zero_point above that. (9) If a direct I/O write is made, set zero_point above that. Assuming the above, any read from the server at or above the zero_point position will return all zeroes. The zero_point value can be stored in the cache, provided the above rules are applied to it by any code that culls part of the local cache. Signed-off-by: David Howells <dhowells@redhat.com> cc: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org 2023-11-24 13:39:02 +00:00			`end = iocb->ki_pos + iov_iter_count(from);`
			`if (end > ictx->zero_point)`
			`ictx->zero_point = end;`
netfs: Implement unbuffered/DIO write support Implement support for unbuffered writes and direct I/O writes. If the write is misaligned with respect to the fscrypt block size, then RMW cycles are performed if necessary. DIO writes are a special case of unbuffered writes with extra restriction imposed, such as block size alignment requirements. Also provide a field that can tell the code to add some extra space onto the bounce buffer for use by the filesystem in the case of a content-encrypted file. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org 2022-02-21 11:38:17 +00:00
			`fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),`
			`FSCACHE_INVAL_DIO_WRITE);`
			`ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);`
			`out:`
			`netfs_end_io_direct(inode);`
			`return ret;`
			`}`
			`EXPORT_SYMBOL(netfs_unbuffered_write_iter);`