linux-stable/include/linux/uio.h
Linus Torvalds a50026bdb8
iov_iter: get rid of 'copy_mc' flag
This flag is only set by one single user: the magical core dumping code
that looks up user pages one by one, and then writes them out using
their kernel addresses (by using a BVEC_ITER).

That actually ends up being a huge problem, because while we do use
copy_mc_to_kernel() for this case and it is able to handle the possible
machine checks involved, nothing else is really ready to handle the
failures caused by the machine check.

In particular, as reported by Tong Tiangen, we don't actually support
fault_in_iov_iter_readable() on a machine check area.

As a result, the usual logic for writing things to a file under a
filesystem lock, which involves doing a copy with page faults disabled
and then if that fails trying to fault pages in without holding the
locks with fault_in_iov_iter_readable() does not work at all.

We could decide to always just make the MC copy "succeed" (and filling
the destination with zeroes), and that would then create a core dump
file that just ignores any machine checks.

But honestly, this single special case has been problematic before, and
means that all the normal iov_iter code ends up slightly more complex
and slower.

See for example commit c9eec08bac ("iov_iter: Don't deal with
iter->copy_mc in memcpy_from_iter_mc()") where David Howells
re-organized the code just to avoid having to check the 'copy_mc' flags
inside the inner iov_iter loops.

So considering that we have exactly one user, and that one user is a
non-critical special case that doesn't actually ever trigger in real
life (Tong found this with manual error injection), the sane solution is
to just decide that the onus on handling the machine check lines on that
user instead.

Ergo, do the copy_mc_to_kernel() in the core dump logic itself, copying
the user data to a stable kernel page before writing it out.

Fixes: f1982740f5 ("iov_iter: Convert iterate*() to inline funcs")
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Link: https://lore.kernel.org/r/20240305133336.3804360-1-tongtiangen@huawei.com
Link: https://lore.kernel.org/all/4e80924d-9c85-f13a-722a-6a5d2b1c225a@huawei.com/
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reported-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-03-06 10:52:12 +01:00

384 lines
11 KiB
C

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Berkeley style UIO structures - Alan Cox 1994.
*/
#ifndef __LINUX_UIO_H
#define __LINUX_UIO_H
#include <linux/kernel.h>
#include <linux/thread_info.h>
#include <linux/mm_types.h>
#include <uapi/linux/uio.h>
struct page;
typedef unsigned int __bitwise iov_iter_extraction_t;
struct kvec {
void *iov_base; /* and that should *never* hold a userland pointer */
size_t iov_len;
};
enum iter_type {
/* iter types */
ITER_UBUF,
ITER_IOVEC,
ITER_BVEC,
ITER_KVEC,
ITER_XARRAY,
ITER_DISCARD,
};
#define ITER_SOURCE 1 // == WRITE
#define ITER_DEST 0 // == READ
struct iov_iter_state {
size_t iov_offset;
size_t count;
unsigned long nr_segs;
};
struct iov_iter {
u8 iter_type;
bool nofault;
bool data_source;
size_t iov_offset;
/*
* Hack alert: overlay ubuf_iovec with iovec + count, so
* that the members resolve correctly regardless of the type
* of iterator used. This means that you can use:
*
* &iter->__ubuf_iovec or iter->__iov
*
* interchangably for the user_backed cases, hence simplifying
* some of the cases that need to deal with both.
*/
union {
/*
* This really should be a const, but we cannot do that without
* also modifying any of the zero-filling iter init functions.
* Leave it non-const for now, but it should be treated as such.
*/
struct iovec __ubuf_iovec;
struct {
union {
/* use iter_iov() to get the current vec */
const struct iovec *__iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
struct xarray *xarray;
void __user *ubuf;
};
size_t count;
};
};
union {
unsigned long nr_segs;
loff_t xarray_start;
};
};
static inline const struct iovec *iter_iov(const struct iov_iter *iter)
{
if (iter->iter_type == ITER_UBUF)
return (const struct iovec *) &iter->__ubuf_iovec;
return iter->__iov;
}
#define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset)
#define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset)
static inline enum iter_type iov_iter_type(const struct iov_iter *i)
{
return i->iter_type;
}
static inline void iov_iter_save_state(struct iov_iter *iter,
struct iov_iter_state *state)
{
state->iov_offset = iter->iov_offset;
state->count = iter->count;
state->nr_segs = iter->nr_segs;
}
static inline bool iter_is_ubuf(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_UBUF;
}
static inline bool iter_is_iovec(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_IOVEC;
}
static inline bool iov_iter_is_kvec(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_KVEC;
}
static inline bool iov_iter_is_bvec(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_BVEC;
}
static inline bool iov_iter_is_discard(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_DISCARD;
}
static inline bool iov_iter_is_xarray(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_XARRAY;
}
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
{
return i->data_source ? WRITE : READ;
}
static inline bool user_backed_iter(const struct iov_iter *i)
{
return iter_is_ubuf(i) || iter_is_iovec(i);
}
/*
* Total number of bytes covered by an iovec.
*
* NOTE that it is not safe to use this function until all the iovec's
* segment lengths have been validated. Because the individual lengths can
* overflow a size_t when added together.
*/
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
unsigned long seg;
size_t ret = 0;
for (seg = 0; seg < nr_segs; seg++)
ret += iov[seg].iov_len;
return ret;
}
size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
size_t bytes, struct iov_iter *i);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
size_t iov_iter_single_seg_count(const struct iov_iter *i);
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i);
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
size_t bytes, struct iov_iter *i)
{
return copy_page_to_iter(&folio->page, offset, bytes, i);
}
static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
size_t offset, size_t bytes, struct iov_iter *i)
{
return copy_page_from_iter_atomic(&folio->page, offset, bytes, i);
}
size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
size_t bytes, struct iov_iter *i);
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
if (check_copy_size(addr, bytes, true))
return _copy_to_iter(addr, bytes, i);
return 0;
}
static __always_inline __must_check
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
if (check_copy_size(addr, bytes, false))
return _copy_from_iter(addr, bytes, i);
return 0;
}
static __always_inline __must_check
bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
{
size_t copied = copy_from_iter(addr, bytes, i);
if (likely(copied == bytes))
return true;
iov_iter_revert(i, copied);
return false;
}
static __always_inline __must_check
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
if (check_copy_size(addr, bytes, false))
return _copy_from_iter_nocache(addr, bytes, i);
return 0;
}
static __always_inline __must_check
bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
size_t copied = copy_from_iter_nocache(addr, bytes, i);
if (likely(copied == bytes))
return true;
iov_iter_revert(i, copied);
return false;
}
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/*
* Note, users like pmem that depend on the stricter semantics of
* _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for
* IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the
* destination is flushed from the cache on return.
*/
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
#else
#define _copy_from_iter_flushcache _copy_from_iter_nocache
#endif
#ifdef CONFIG_ARCH_HAS_COPY_MC
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
#else
#define _copy_mc_to_iter _copy_to_iter
#endif
size_t iov_iter_zero(size_t bytes, struct iov_iter *);
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
unsigned len_mask);
unsigned long iov_iter_alignment(const struct iov_iter *i);
unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
unsigned long nr_segs, size_t count);
void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec,
unsigned long nr_segs, size_t count);
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
unsigned long nr_segs, size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
loff_t start, size_t count);
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
static inline size_t iov_iter_count(const struct iov_iter *i)
{
return i->count;
}
/*
* Cap the iov_iter by given limit; note that the second argument is
* *not* the new size - it's upper limit for such. Passing it a value
* greater than the amount of data in iov_iter is fine - it'll just do
* nothing in that case.
*/
static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
{
/*
* count doesn't have to fit in size_t - comparison extends both
* operands to u64 here and any value that would be truncated by
* conversion in assignement is by definition greater than all
* values of size_t, including old i->count.
*/
if (i->count > count)
i->count = count;
}
/*
* reexpand a previously truncated iterator; count must be no more than how much
* we had shrunk it.
*/
static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
{
i->count = count;
}
static inline int
iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
{
size_t shorted = 0;
int npages;
if (iov_iter_count(i) > max_bytes) {
shorted = iov_iter_count(i) - max_bytes;
iov_iter_truncate(i, max_bytes);
}
npages = iov_iter_npages(i, maxpages);
if (shorted)
iov_iter_reexpand(i, iov_iter_count(i) + shorted);
return npages;
}
struct iovec *iovec_from_user(const struct iovec __user *uvector,
unsigned long nr_segs, unsigned long fast_segs,
struct iovec *fast_iov, bool compat);
ssize_t import_iovec(int type, const struct iovec __user *uvec,
unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
struct iov_iter *i);
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
struct iov_iter *i, bool compat);
int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);
static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
void __user *buf, size_t count)
{
WARN_ON(direction & ~(READ | WRITE));
*i = (struct iov_iter) {
.iter_type = ITER_UBUF,
.data_source = direction,
.ubuf = buf,
.count = count,
.nr_segs = 1
};
}
/* Flags for iov_iter_get/extract_pages*() */
/* Allow P2PDMA on the extracted pages */
#define ITER_ALLOW_P2PDMA ((__force iov_iter_extraction_t)0x01)
ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
size_t maxsize, unsigned int maxpages,
iov_iter_extraction_t extraction_flags,
size_t *offset0);
/**
* iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
* @iter: The iterator
*
* Examine the iterator and indicate by returning true or false as to how, if
* at all, pages extracted from the iterator will be retained by the extraction
* function.
*
* %true indicates that the pages will have a pin placed in them that the
* caller must unpin. This is must be done for DMA/async DIO to force fork()
* to forcibly copy a page for the child (the parent must retain the original
* page).
*
* %false indicates that no measures are taken and that it's up to the caller
* to retain the pages.
*/
static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter)
{
return user_backed_iter(iter);
}
struct sg_table;
ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len,
struct sg_table *sgtable, unsigned int sg_max,
iov_iter_extraction_t extraction_flags);
#endif